@phdthesis{17465,
  abstract     = {In the modern age of machine learning, artificial neural networks have become an integral part
of many practical systems. One of the key ingredients of the success of the deep learning
approach is recent computational advances which allowed the training of models with billions
of parameters on large-scale data. Such over-parameterized and data-hungry regimes pose a
challenge for the theoretical analysis of modern models since “classical” statistical wisdom
is no longer applicable. In this view, it is paramount to extend or develop new machinery
that will allow tackling the neural network analysis under new challenging asymptotic regimes,
which is the focus of this thesis.
Large neural network systems are usually optimized via “local” search algorithms, such
as stochastic gradient descent (SGD). However, given the high-dimensional nature of the
parameter space, it is a priori not clear why such a crude “local” approach works so remarkably
well in practice. We take a step towards demystifying this phenomenon by showing that
the landscape of the SGD training dynamics exhibits a few beneficial properties for the
optimization. First, we show that along the SGD trajectory an over-parameterized network
is dropout stable. The emergence of dropout stability allows to conclude that the minima
found by SGD are connected via a continuous path of small loss. This in turn means that
the high-dimensional landscape of the neural network optimization problem is provably not so
unfavourable to gradient-based training, due to mode connectivity. Next, we show that SGD
for an over-parameterized network tends to find solutions that are functionally more “simple”.
This in turn means that the SGD minima are more robust, since a less complicated solution
will less likely overfit the data. More formally, for a prototypical example of a wide two-layer
ReLU network on a 1d regression task we show that the SGD algorithm is implicitly selective in
its choice of an interpolating solution. Namely, at convergence the neural network implements
a piece-wise linear function with the number of linear regions depending only on the amount
of training data. This is in contrast to a “smooth”-like behaviour which one would expect
given such a severe over-parameterization of the model.
Diverging from the generic supervised setting of classification and regression problems, we
analyze an auto-encoder model that is commonly used for representation learning and data
compression. Despite the wide applicability of the auto-encoding paradigm, the theoretical
understanding of their behaviour is limited even in the simplistic shallow case. The related
work is restricted to extreme asymptotic regimes in which the auto-encoder is either severely
over-parameterized or under-parameterized. In contrast, we provide a tight characterization
for the 1-bit compression of Gaussian signals in the challenging proportional regime, i.e., the
input dimension and the size of the compressed representation obey the same asymptotics.
We also show that gradient-based methods are able to find a globally optimal solution and
that the predictions made for Gaussian data extrapolate beyond - to the case of compression
of natural images. Next, we relax the Gaussian assumption and study more structured input
sources. We show that the shallow model is sometimes agnostic to the structure of the data
vii
which results in a Gaussian-like behaviour. We prove that making the decoding component
slightly less shallow is already enough to escape the “curse” of Gaussian performance.
},
  author       = {Shevchenko, Aleksandr},
  issn         = {2663-337X},
  pages        = {232},
  publisher    = {Institute of Science and Technology Austria},
  title        = {{High-dimensional limits in artificial neural networks}},
  doi          = {10.15479/at:ista:17465},
  year         = {2024},
}

@inproceedings{17469,
  abstract     = {Autoencoders are a prominent model in many empirical branches of machine learning and lossy data compression. However, basic theoretical questions remain unanswered even in a shallow two-layer setting. In particular, to what degree does a shallow autoencoder capture the structure of the underlying data distribution? For the prototypical case of the 1-bit compression of sparse Gaussian data, we prove that gradient descent converges to a solution that completely disregards the sparse structure of the input. Namely, the performance of the algorithm is the same as if it was compressing a Gaussian source - with no sparsity. For general data distributions, we give evidence of a phase transition phenomenon in the shape of the gradient descent minimizer, as a function of the data sparsity: below the critical sparsity level, the minimizer is a rotation taken uniformly at random (just like in the compression of non-sparse data); above the critical sparsity, the minimizer is the identity (up to a permutation). Finally, by exploiting a connection with approximate message passing algorithms, we show how to improve upon Gaussian performance for the compression of sparse data: adding a denoising function to a shallow architecture already reduces the loss provably, and a suitable multi-layer decoder leads to a further improvement. We validate our findings on image datasets, such as CIFAR-10 and MNIST.},
  author       = {Kögler, Kevin and Shevchenko, Aleksandr and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  location     = {Vienna, Austria},
  pages        = {24964--25015},
  publisher    = {ML Research Press},
  title        = {{Compression of structured data with autoencoders: Provable benefit of nonlinearities and depth}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{14459,
  abstract     = {Autoencoders are a popular model in many branches of machine learning and lossy data compression. However, their fundamental limits, the performance of gradient methods and the features learnt during optimization remain poorly understood, even in the two-layer setting. In fact, earlier work has considered either linear autoencoders or specific training regimes (leading to vanishing or diverging compression rates). Our paper addresses this gap by focusing on non-linear two-layer autoencoders trained in the challenging proportional regime in which the input dimension scales linearly with the size of the representation. Our results characterize the minimizers of the population risk, and show that such minimizers are achieved by gradient methods; their structure is also unveiled, thus leading to a concise description of the features obtained via training. For the special case of a sign activation function, our analysis establishes the fundamental limits for the lossy compression of Gaussian sources via (shallow) autoencoders. Finally, while the results are proved for Gaussian data, numerical simulations on standard datasets display the universality of the theoretical predictions.},
  author       = {Shevchenko, Aleksandr and Kögler, Kevin and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Honolulu, Hawaii, HI, United States},
  pages        = {31151--31209},
  publisher    = {ML Research Press},
  title        = {{Fundamental limits of two-layer autoencoders, and achieving them with gradient methods}},
  volume       = {202},
  year         = {2023},
}

@article{11420,
  abstract     = {Understanding the properties of neural networks trained via stochastic gradient descent (SGD) is at the heart of the theory of deep learning. In this work, we take a mean-field view, and consider a two-layer ReLU network trained via noisy-SGD for a univariate regularized regression problem. Our main result is that SGD with vanishingly small noise injected in the gradients is biased towards a simple solution: at convergence, the ReLU network implements a piecewise linear map of the inputs, and the number of “knot” points -- i.e., points where the tangent of the ReLU network estimator changes -- between two consecutive training inputs is at most three. In particular, as the number of neurons of the network grows, the SGD dynamics is captured by the solution of a gradient flow and, at convergence, the distribution of the weights approaches the unique minimizer of a related free energy, which has a Gibbs form. Our key technical contribution consists in the analysis of the estimator resulting from this minimizer: we show that its second derivative vanishes everywhere, except at some specific locations which represent the “knot” points. We also provide empirical evidence that knots at locations distinct from the data points might occur, as predicted by our theory.},
  author       = {Shevchenko, Aleksandr and Kungurtsev, Vyacheslav and Mondelli, Marco},
  issn         = {1533-7928},
  journal      = {Journal of Machine Learning Research},
  number       = {130},
  pages        = {1--55},
  publisher    = {Journal of Machine Learning Research},
  title        = {{Mean-field analysis of piecewise linear solutions for wide ReLU networks}},
  volume       = {23},
  year         = {2022},
}

@inproceedings{9198,
  abstract     = {The optimization of multilayer neural networks typically leads to a solution
with zero training error, yet the landscape can exhibit spurious local minima
and the minima can be disconnected. In this paper, we shed light on this
phenomenon: we show that the combination of stochastic gradient descent (SGD)
and over-parameterization makes the landscape of multilayer neural networks
approximately connected and thus more favorable to optimization. More
specifically, we prove that SGD solutions are connected via a piecewise linear
path, and the increase in loss along this path vanishes as the number of
neurons grows large. This result is a consequence of the fact that the
parameters found by SGD are increasingly dropout stable as the network becomes
wider. We show that, if we remove part of the neurons (and suitably rescale the
remaining ones), the change in loss is independent of the total number of
neurons, and it depends only on how many neurons are left. Our results exhibit
a mild dependence on the input dimension: they are dimension-free for two-layer
networks and depend linearly on the dimension for multilayer networks. We
validate our theoretical findings with numerical experiments for different
architectures and classification tasks.},
  author       = {Shevchenko, Aleksandr and Mondelli, Marco},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  pages        = {8773--8784},
  publisher    = {ML Research Press},
  title        = {{Landscape connectivity and dropout stability of SGD solutions for over-parameterized neural networks}},
  volume       = {119},
  year         = {2020},
}

