@inproceedings{18890,
  abstract     = {Deep Neural Collapse (DNC) refers to the surprisingly rigid structure of the data representations in the final layers of Deep Neural Networks (DNNs). Though the phenomenon has been measured in a variety of settings, its emergence is typically explained via data-agnostic approaches, such as the unconstrained features model. In this work, we introduce a data-dependent setting where DNC forms due to feature learning through the average gradient outer product (AGOP). The AGOP is defined with respect to a learned predictor and is equal to the uncentered covariance matrix of its input-output gradients averaged over the training dataset. The Deep Recursive Feature Machine (Deep RFM) is a method that constructs a neural network by iteratively mapping the data with the AGOP and applying an untrained random feature map. We demonstrate empirically that DNC occurs in Deep RFM across standard settings as a consequence of the projection with the AGOP matrix computed at each layer. Further, we theoretically explain DNC in Deep RFM in an asymptotic setting and as a result of kernel learning. We then provide evidence that this mechanism holds for neural networks more generally. In particular, we show that the right singular vectors and values of the weights can be responsible for the majority of within-class variability collapse for DNNs trained in the feature learning regime. As observed in recent work, this singular structure is highly correlated with that of the AGOP.},
  author       = {Beaglehole, Daniel and Súkeník, Peter and Mondelli, Marco and Belkin, Mikhail},
  booktitle    = {38th Annual Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Average gradient outer product as a mechanism for deep neural collapse}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{18891,
  abstract     = {Deep neural networks (DNNs) exhibit a surprising structure in their final layer
known as neural collapse (NC), and a growing body of works has currently investigated the propagation of neural collapse to earlier layers of DNNs – a phenomenon
called deep neural collapse (DNC). However, existing theoretical results are restricted to special cases: linear models, only two layers or binary classification.
In contrast, we focus on non-linear models of arbitrary depth in multi-class classification and reveal a surprising qualitative shift. As soon as we go beyond two
layers or two classes, DNC stops being optimal for the deep unconstrained features
model (DUFM) – the standard theoretical framework for the analysis of collapse.
The main culprit is a low-rank bias of multi-layer regularization schemes: this bias
leads to optimal solutions of even lower rank than the neural collapse. We support
our theoretical findings with experiments on both DUFM and real data, which show
the emergence of the low-rank structure in the solution found by gradient descent.},
  author       = {Súkeník, Peter and Lampert, Christoph and Mondelli, Marco},
  booktitle    = {38th Annual Conference on Neural Information Processing Systems},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Neural collapse versus low-rank bias: Is deep neural collapse really optimal?}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{18897,
  abstract     = {Score-based generative models (SGMs) are powerful tools to sample from complex data distributions. Their underlying idea is to (i) run a forward process for time T1 by adding noise to the data, (ii) estimate its score function, and (iii) use such estimate to run a reverse process. As the reverse process is initialized with the stationary distribution of the forward one, the existing analysis paradigm requires T1→∞. This is however problematic: from a theoretical viewpoint, for a given precision of the score approximation, the convergence guarantee fails as T1 diverges; from a practical viewpoint, a large T1 increases computational costs and leads to error propagation. This paper addresses the issue by considering a version of the popular predictor-corrector scheme: after running the forward process, we first estimate the final distribution via an inexact Langevin dynamics and then revert the process. Our key technical contribution is to provide convergence guarantees which require to run the forward process only for a fixed finite time T1. Our bounds exhibit a mild logarithmic dependence on the input dimension and the subgaussian norm of the target distribution, have minimal assumptions on the data, and require only to control the L2 loss on the score approximation, which is the quantity minimized in practice.},
  author       = {Pedrotti, Francesco and Maas, Jan and Mondelli, Marco},
  booktitle    = {Transactions on Machine Learning Research},
  issn         = {2835-8856},
  title        = {{Improved convergence of score-based diffusion models via prediction-correction}},
  year         = {2024},
}

@inproceedings{18972,
  abstract     = {Deep learning models are known to overfit and memorize spurious features in the training dataset. While numerous empirical studies have aimed at understanding this phenomenon, a rigorous theoretical framework to quantify it is still missing. In this paper, we consider spurious features that are uncorrelated with the learning task, and we provide a precise characterization of how they are memorized via two separate terms: (i) the stability of the model with respect to individual training samples, and (ii) the feature alignment between the spurious pattern and the full sample. While the first term is well established in learning theory and it is connected to the generalization error in classical work, the second one is, to the best of our knowledge, novel. Our key technical result gives a precise characterization of the feature alignment for the two prototypical settings of random features (RF) and neural tangent kernel (NTK) regression. We prove that the memorization of spurious features weakens as the generalization capability increases and, through the analysis of the feature alignment, we unveil the role of the model and of its activation function. Numerical experiments show the predictive power of our theory on standard datasets (MNIST, CIFAR-10).},
  author       = {Bombari, Simone and Mondelli, Marco},
  booktitle    = {41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {4267--4299},
  publisher    = {ML Research Press},
  title        = {{How spurious features are memorized: Precise analysis for random and NTK features}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{18973,
  abstract     = {Understanding the reasons behind the exceptional success of transformers requires a better analysis of why attention layers are suitable for NLP tasks. In particular, such tasks require predictive models to capture contextual meaning which often depends on one or few words, even if the sentence is long. Our work studies this key property, dubbed word sensitivity (WS), in the prototypical setting of random features. We show that attention layers enjoy high WS, namely, there exists a vector in the space of embeddings that largely perturbs the random attention features map. The argument critically exploits the role of the softmax in the attention layer, highlighting its benefit compared to other activations (e.g., ReLU). In contrast, the WS of standard random features is of order 1/n−−√, n being the number of words in the textual sample, and thus it decays with the length of the context. We then translate these results on the word sensitivity into generalization bounds: due to their low WS, random features provably cannot learn to distinguish between two sentences that differ only in a single word; in contrast, due to their high WS, random attention features have higher generalization capabilities. We validate our theoretical results with experimental evidence over the BERT-Base word embeddings of the imdb review dataset.},
  author       = {Bombari, Simone and Mondelli, Marco},
  booktitle    = {41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {4300--4328},
  publisher    = {ML Research Press},
  title        = {{Towards understanding the word sensitivity of attention layers: A study via random features}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{19518,
  abstract     = {The rising footprint of machine learning has led to a focus on imposing model
sparsity as a means of reducing computational and memory costs. For deep neural
networks (DNNs), the state-of-the-art accuracy-vs-sparsity is achieved by heuristics
inspired by the classical Optimal Brain Surgeon (OBS) framework [LeCun et al.,
1989, Hassibi and Stork, 1992, Hassibi et al., 1993], which leverages loss curvature
information to make better pruning decisions. Yet, these results still lack a solid
theoretical understanding, and it is unclear whether they can be improved by
leveraging connections to the wealth of work on sparse recovery algorithms. In this
paper, we draw new connections between these two areas and present new sparse
recovery algorithms inspired by the OBS framework that comes with theoretical
guarantees under reasonable assumptions and have strong practical performance.
Specifically, our work starts from the observation that we can leverage curvature
information in OBS-like fashion upon the projection step of classic iterative sparse
recovery algorithms such as IHT. We show for the first time that this leads both
to improved convergence bounds under standard assumptions. Furthermore, we
present extensions of this approach to the practical task of obtaining accurate sparse
DNNs, and validate it experimentally at scale for Transformer-based models on
vision and language tasks.},
  author       = {Wu, Diyuan and Modoranu, Ionut-Vlad and Safaryan, Mher and Kuznedelev, Denis and Alistarh, Dan-Adrian},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information}},
  volume       = {37},
  year         = {2024},
}

@article{15172,
  abstract     = {We propose a novel approach to concentration for non-independent random variables. The main idea is to “pretend” that the random variables are independent and pay a multiplicative price measuring how far they are from actually being independent. This price is encapsulated in the Hellinger integral between the joint and the product of the marginals, which is then upper bounded leveraging tensorisation properties. Our bounds represent a natural generalisation of concentration inequalities in the presence of dependence: we recover exactly the classical bounds (McDiarmid’s inequality) when the random variables are independent. Furthermore, in a “large deviations” regime, we obtain the same decay in the probability as for the independent case, even when the random variables display non-trivial dependencies. To show this, we consider a number of applications of interest. First, we provide a bound for Markov chains with finite state space. Then, we consider the Simple Symmetric Random Walk, which is a non-contracting Markov chain, and a non-Markovian setting in which the stochastic process depends on its entire past. To conclude, we propose an application to Markov Chain Monte Carlo methods, where our approach leads to an improved lower bound on the minimum burn-in period required to reach a certain accuracy. In all of these settings, we provide a regime of parameters in which our bound fares better than what the state of the art can provide.},
  author       = {Esposito, Amedeo Roberto and Mondelli, Marco},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {6},
  pages        = {3823--3839},
  publisher    = {IEEE},
  title        = {{Concentration without independence via information measures}},
  doi          = {10.1109/TIT.2024.3367767},
  volume       = {70},
  year         = {2024},
}

@inproceedings{17147,
  abstract     = {Efficient utilization of large-scale biobank data is crucial for inferring the genetic basis of disease and predicting health outcomes from the DNA. Yet we lack efficient, accurate methods that scale to data where electronic health records are linked to whole genome sequence information. To address this issue, our paper develops a new algorithmic paradigm based on Approximate Message Passing (AMP), which is specifically tailored for genomic prediction and association testing. Our method yields comparable out-of-sample prediction accuracy to the state of the art on UK Biobank traits, whilst dramatically improving computational complexity, with a 8x-speed up in the run time. In addition, AMP theory provides a joint association testing framework, which outperforms the currently used REGENIE method, in roughly a third of the compute time. This first, truly large-scale application of the AMP framework lays the foundations for a far wider range of statistical analyses for hundreds of millions of variables measured on millions of people.},
  author       = {Depope, Al and Mondelli, Marco and Robinson, Matthew Richard},
  booktitle    = {2024 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  isbn         = {9798350344851},
  issn         = {1520-6149},
  location     = {Seoul, Korea},
  pages        = {13151--13155},
  publisher    = {IEEE},
  title        = {{Inference of genetic effects via approximate message passing}},
  doi          = {10.1109/ICASSP48485.2024.10447198},
  year         = {2024},
}

@article{17330,
  abstract     = {In this work we consider the list-decodability and list-recoverability of arbitrary q -ary codes, for all integer values of q ⩾ 2. A code is called ( p , L ) q -list-decodable if every radius pn Hamming ball contains less than L codewords; ( p , ℓ, L ) q -recoverability is a generalization where we place radius pn Hamming balls on every point of a combinatorial rectangle with side length ℓ and again stipulate that there be less than L codewords. Our main contribution is to precisely calculate the maximum value of p for which there exist infinite families of positive rate ( p , ℓ, L ) q -list-recoverable codes, the quantity we call the zero-rate threshold . Denoting this value by p *, we in fact show that codes correcting a p * + ε fraction of errors must have size O ε (1), i.e., independent of n . Such a result is typically referred to as a “Plotkin bound.” To complement this, a standard random code with expurgation construction shows that there exist positive rate codes correcting a p * − ε fraction of errors. We also follow a classical proof template (typically attributed to Elias and Bassalygo) to derive from the zero-rate threshold other tradeoffs between rate and decoding radius for list-decoding and list-recovery. Technically, proving the Plotkin bound boils down to demon-strating the Schur convexity of a certain function defined on the q -simplex as well as the convexity of a univariate function derived from it. We remark that an earlier argument claimed similar results for q -ary list-decoding; however, we point out that this earlier proof is flawed.},
  author       = {Resch, Nicolas and Yuan, Chen and Zhang, Yihan},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {9},
  pages        = {6211--6238},
  publisher    = {IEEE},
  title        = {{Zero-rate thresholds and new capacity bounds for list-decoding and list-recovery}},
  doi          = {10.1109/TIT.2024.3430842},
  volume       = {70},
  year         = {2024},
}

@unpublished{17350,
  abstract     = {Score-based generative models (SGMs) are powerful tools to sample from
complex data distributions. Their underlying idea is to (i) run a forward
process for time $T_1$ by adding noise to the data, (ii) estimate its score
function, and (iii) use such estimate to run a reverse process. As the reverse
process is initialized with the stationary distribution of the forward one, the
existing analysis paradigm requires $T_1\to\infty$. This is however
problematic: from a theoretical viewpoint, for a given precision of the score
approximation, the convergence guarantee fails as $T_1$ diverges; from a
practical viewpoint, a large $T_1$ increases computational costs and leads to
error propagation. This paper addresses the issue by considering a version of
the popular predictor-corrector scheme: after running the forward process, we
first estimate the final distribution via an inexact Langevin dynamics and then
revert the process. Our key technical contribution is to provide convergence
guarantees which require to run the forward process only for a fixed finite
time $T_1$. Our bounds exhibit a mild logarithmic dependence on the input
dimension and the subgaussian norm of the target distribution, have minimal
assumptions on the data, and require only to control the $L^2$ loss on the
score approximation, which is the quantity minimized in practice.},
  author       = {Pedrotti, Francesco and Maas, Jan and Mondelli, Marco},
  booktitle    = {arXiv},
  title        = {{Improved convergence of score-based diffusion models via prediction-correction}},
  doi          = {10.48550/arXiv.2305.14164},
  year         = {2024},
}

@phdthesis{17465,
  abstract     = {In the modern age of machine learning, artificial neural networks have become an integral part
of many practical systems. One of the key ingredients of the success of the deep learning
approach is recent computational advances which allowed the training of models with billions
of parameters on large-scale data. Such over-parameterized and data-hungry regimes pose a
challenge for the theoretical analysis of modern models since “classical” statistical wisdom
is no longer applicable. In this view, it is paramount to extend or develop new machinery
that will allow tackling the neural network analysis under new challenging asymptotic regimes,
which is the focus of this thesis.
Large neural network systems are usually optimized via “local” search algorithms, such
as stochastic gradient descent (SGD). However, given the high-dimensional nature of the
parameter space, it is a priori not clear why such a crude “local” approach works so remarkably
well in practice. We take a step towards demystifying this phenomenon by showing that
the landscape of the SGD training dynamics exhibits a few beneficial properties for the
optimization. First, we show that along the SGD trajectory an over-parameterized network
is dropout stable. The emergence of dropout stability allows to conclude that the minima
found by SGD are connected via a continuous path of small loss. This in turn means that
the high-dimensional landscape of the neural network optimization problem is provably not so
unfavourable to gradient-based training, due to mode connectivity. Next, we show that SGD
for an over-parameterized network tends to find solutions that are functionally more “simple”.
This in turn means that the SGD minima are more robust, since a less complicated solution
will less likely overfit the data. More formally, for a prototypical example of a wide two-layer
ReLU network on a 1d regression task we show that the SGD algorithm is implicitly selective in
its choice of an interpolating solution. Namely, at convergence the neural network implements
a piece-wise linear function with the number of linear regions depending only on the amount
of training data. This is in contrast to a “smooth”-like behaviour which one would expect
given such a severe over-parameterization of the model.
Diverging from the generic supervised setting of classification and regression problems, we
analyze an auto-encoder model that is commonly used for representation learning and data
compression. Despite the wide applicability of the auto-encoding paradigm, the theoretical
understanding of their behaviour is limited even in the simplistic shallow case. The related
work is restricted to extreme asymptotic regimes in which the auto-encoder is either severely
over-parameterized or under-parameterized. In contrast, we provide a tight characterization
for the 1-bit compression of Gaussian signals in the challenging proportional regime, i.e., the
input dimension and the size of the compressed representation obey the same asymptotics.
We also show that gradient-based methods are able to find a globally optimal solution and
that the predictions made for Gaussian data extrapolate beyond - to the case of compression
of natural images. Next, we relax the Gaussian assumption and study more structured input
sources. We show that the shallow model is sometimes agnostic to the structure of the data
vii
which results in a Gaussian-like behaviour. We prove that making the decoding component
slightly less shallow is already enough to escape the “curse” of Gaussian performance.
},
  author       = {Shevchenko, Aleksandr},
  issn         = {2663-337X},
  pages        = {232},
  publisher    = {Institute of Science and Technology Austria},
  title        = {{High-dimensional limits in artificial neural networks}},
  doi          = {10.15479/at:ista:17465},
  year         = {2024},
}

@inproceedings{17469,
  abstract     = {Autoencoders are a prominent model in many empirical branches of machine learning and lossy data compression. However, basic theoretical questions remain unanswered even in a shallow two-layer setting. In particular, to what degree does a shallow autoencoder capture the structure of the underlying data distribution? For the prototypical case of the 1-bit compression of sparse Gaussian data, we prove that gradient descent converges to a solution that completely disregards the sparse structure of the input. Namely, the performance of the algorithm is the same as if it was compressing a Gaussian source - with no sparsity. For general data distributions, we give evidence of a phase transition phenomenon in the shape of the gradient descent minimizer, as a function of the data sparsity: below the critical sparsity level, the minimizer is a rotation taken uniformly at random (just like in the compression of non-sparse data); above the critical sparsity, the minimizer is the identity (up to a permutation). Finally, by exploiting a connection with approximate message passing algorithms, we show how to improve upon Gaussian performance for the compression of sparse data: adding a denoising function to a shallow architecture already reduces the loss provably, and a suitable multi-layer decoder leads to a further improvement. We validate our findings on image datasets, such as CIFAR-10 and MNIST.},
  author       = {Kögler, Kevin and Shevchenko, Aleksandr and Hassani, Hamed and Mondelli, Marco},
  booktitle    = {Proceedings of the 41st International Conference on Machine Learning},
  location     = {Vienna, Austria},
  pages        = {24964--25015},
  publisher    = {ML Research Press},
  title        = {{Compression of structured data with autoencoders: Provable benefit of nonlinearities and depth}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{14083,
  abstract     = {In this work we consider the list-decodability and list-recoverability of arbitrary q-ary codes, for all integer values of q ≥ 2. A code is called (p,L)_q-list-decodable if every radius pn Hamming ball contains less than L codewords; (p,𝓁,L)_q-list-recoverability is a generalization where we place radius pn Hamming balls on every point of a combinatorial rectangle with side length 𝓁 and again stipulate that there be less than L codewords.
Our main contribution is to precisely calculate the maximum value of p for which there exist infinite families of positive rate (p,𝓁,L)_q-list-recoverable codes, the quantity we call the zero-rate threshold. Denoting this value by p_*, we in fact show that codes correcting a p_*+ε fraction of errors must have size O_ε(1), i.e., independent of n. Such a result is typically referred to as a "Plotkin bound." To complement this, a standard random code with expurgation construction shows that there exist positive rate codes correcting a p_*-ε fraction of errors. We also follow a classical proof template (typically attributed to Elias and Bassalygo) to derive from the zero-rate threshold other tradeoffs between rate and decoding radius for list-decoding and list-recovery.
Technically, proving the Plotkin bound boils down to demonstrating the Schur convexity of a certain function defined on the q-simplex as well as the convexity of a univariate function derived from it. We remark that an earlier argument claimed similar results for q-ary list-decoding; however, we point out that this earlier proof is flawed.},
  author       = {Resch, Nicolas and Yuan, Chen and Zhang, Yihan},
  booktitle    = {50th International Colloquium on Automata, Languages, and Programming},
  isbn         = {9783959772785},
  issn         = {1868-8969},
  location     = {Paderborn, Germany},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Zero-rate thresholds and new capacity bounds for list-decoding and list-recovery}},
  doi          = {10.4230/LIPIcs.ICALP.2023.99},
  volume       = {261},
  year         = {2023},
}

@article{12838,
  abstract     = {We study the problem of high-dimensional multiple packing in Euclidean space. Multiple packing is a natural generalization of sphere packing and is defined as follows. Let N > 0 and L ∈ Z ≽2 . A multiple packing is a set C of points in R n such that any point in R n lies in the intersection of at most L – 1 balls of radius √ nN around points in C . Given a well-known connection with coding theory, multiple packings can be viewed as the Euclidean analog of list-decodable codes, which are well-studied for finite fields. In this paper, we derive the best known lower bounds on the optimal density of list-decodable infinite constellations for constant L under a stronger notion called average-radius multiple packing. To this end, we apply tools from high-dimensional geometry and large deviation theory.},
  author       = {Zhang, Yihan and Vatedka, Shashank},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {7},
  pages        = {4513--4527},
  publisher    = {IEEE},
  title        = {{Multiple packing: Lower bounds via infinite constellations}},
  doi          = {10.1109/TIT.2023.3260950},
  volume       = {69},
  year         = {2023},
}

@inproceedings{12859,
  abstract     = {Machine learning models are vulnerable to adversarial perturbations, and a thought-provoking paper by Bubeck and Sellke has analyzed this phenomenon through the lens of over-parameterization: interpolating smoothly the data requires significantly more parameters than simply memorizing it. However, this "universal" law provides only a necessary condition for robustness, and it is unable to discriminate between models. In this paper, we address these gaps by focusing on empirical risk minimization in two prototypical settings, namely, random features and the neural tangent kernel (NTK). We prove that, for random features, the model is not robust for any degree of over-parameterization, even when the necessary condition coming from the universal law of robustness is satisfied. In contrast, for even activations, the NTK model meets the universal lower bound, and it is robust as soon as the necessary condition on over-parameterization is fulfilled. This also addresses a conjecture in prior work by Bubeck, Li and Nagaraj. Our analysis decouples the effect of the kernel of the model from an "interaction matrix", which describes the interaction with the test data and captures the effect of the activation. Our theoretical results are corroborated by numerical evidence on both synthetic and standard datasets (MNIST, CIFAR-10).},
  author       = {Bombari, Simone and Kiyani, Shayan and Mondelli, Marco},
  booktitle    = {Proceedings of the 40th International Conference on Machine Learning},
  location     = {Honolulu, HI, United States},
  pages        = {2738--2776},
  publisher    = {ML Research Press},
  title        = {{Beyond the universal law of robustness: Sharper laws for random features and neural tangent kernels}},
  volume       = {202},
  year         = {2023},
}

@article{13269,
  abstract     = {This paper is a collection of results on combinatorial properties of codes for the Z-channel . A Z-channel with error fraction τ takes as input a length- n binary codeword and injects in an adversarial manner up to n τ asymmetric errors, i.e., errors that only zero out bits but do not flip 0’s to 1’s. It is known that the largest ( L - 1)-list-decodable code for the Z-channel with error fraction τ has exponential size (in n ) if τ is less than a critical value that we call the ( L - 1)- list-decoding Plotkin point and has constant size if τ is larger than the threshold. The ( L -1)-list-decoding Plotkin point is known to be L -1/L-1 – L -L/ L-1 , which equals 1/4 for unique-decoding with L -1 = 1. In this paper, we derive various results for the size of the largest codes above and below the list-decoding Plotkin point. In particular, we show that the largest ( L -1)-list-decodable code ε-above the Plotkin point, for any given sufficiently small positive constant ε > 0, has size Θ L (ε -3/2 ) for any L - 1 ≥ 1. We also devise upper and lower bounds on the exponential size of codes below the list-decoding Plotkin point.},
  author       = {Polyanskii, Nikita and Zhang, Yihan},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {10},
  pages        = {6340--6357},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Codes for the Z-channel}},
  doi          = {10.1109/TIT.2023.3292219},
  volume       = {69},
  year         = {2023},
}

@article{13315,
  abstract     = {How do statistical dependencies in measurement noise influence high-dimensional inference? To answer this, we study the paradigmatic spiked matrix model of principal components analysis (PCA), where a rank-one matrix is corrupted by additive noise. We go beyond the usual independence assumption on the noise entries, by drawing the noise from a low-order polynomial orthogonal matrix ensemble. The resulting noise correlations make the setting relevant for applications but analytically challenging. We provide characterization of the Bayes optimal limits of inference in this model. If the spike is rotation invariant, we show that standard spectral PCA is optimal. However, for more general priors, both PCA and the existing approximate message-passing algorithm (AMP) fall short of achieving the information-theoretic limits, which we compute using the replica method from statistical physics. We thus propose an AMP, inspired by the theory of adaptive Thouless–Anderson–Palmer equations, which is empirically observed to saturate the conjectured theoretical limit. This AMP comes with a rigorous state evolution analysis tracking its performance. Although we focus on specific noise distributions, our methodology can be generalized to a wide class of trace matrix ensembles at the cost of more involved expressions. Finally, despite the seemingly strong assumption of rotation-invariant noise, our theory empirically predicts algorithmic performance on real data, pointing at strong universality properties.},
  author       = {Barbier, Jean and Camilli, Francesco and Mondelli, Marco and Sáenz, Manuel},
  issn         = {1091-6490},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  number       = {30},
  publisher    = {National Academy of Sciences},
  title        = {{Fundamental limits in structured principal component analysis and how to reach them}},
  doi          = {10.1073/pnas.2302028120},
  volume       = {120},
  year         = {2023},
}

@inproceedings{13321,
  abstract     = {We consider the problem of reconstructing the signal and the hidden variables from observations coming from a multi-layer network with rotationally invariant weight matrices. The multi-layer structure models inference from deep generative priors, and the rotational invariance imposed on the weights generalizes the i.i.d. Gaussian assumption by allowing for a complex correlation structure, which is typical in applications. In this work, we present a new class of approximate message passing (AMP) algorithms and give a state evolution recursion which precisely characterizes their performance in the large system limit. In contrast with the existing multi-layer VAMP (ML-VAMP) approach, our proposed AMP – dubbed multilayer rotationally invariant generalized AMP (ML-RI-GAMP) – provides a natural generalization beyond Gaussian designs, in the sense that it recovers the existing Gaussian AMP as a special case. Furthermore, ML-RI-GAMP exhibits a significantly lower complexity than ML-VAMP, as the computationally intensive singular value decomposition is replaced by an estimation of the moments of the design matrices. Finally, our numerical results show that this complexity gain comes at little to no cost in the performance of the algorithm.},
  author       = {Xu, Yizhou and Hou, Tian Qi and Liang, Shan Suo and Mondelli, Marco},
  booktitle    = {2023 IEEE Information Theory Workshop},
  isbn         = {9798350301496},
  issn         = {2475-4218},
  location     = {Saint-Malo, France},
  pages        = {294--298},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Approximate message passing for multi-layer estimation in rotationally invariant models}},
  doi          = {10.1109/ITW55543.2023.10160238},
  year         = {2023},
}

@article{14751,
  abstract     = {We consider zero-error communication over a two-transmitter deterministic adversarial multiple access channel (MAC) governed by an adversary who has access to the transmissions of both senders (hence called omniscient ) and aims to maliciously corrupt the communication. None of the encoders, jammer and decoder is allowed to randomize using private or public randomness. This enforces a combinatorial nature of the problem. Our model covers a large family of channels studied in the literature, including all deterministic discrete memoryless noisy or noiseless MACs. In this work, given an arbitrary two-transmitter deterministic omniscient adversarial MAC, we characterize when the capacity region: 1) has nonempty interior (in particular, is two-dimensional); 2) consists of two line segments (in particular, has empty interior); 3) consists of one line segment (in particular, is one-dimensional); 4) or only contains (0,0) (in particular, is zero-dimensional). This extends a recent result by Wang et al. (201 9) from the point-to-point setting to the multiple access setting. Indeed, our converse arguments build upon their generalized Plotkin bound and involve delicate case analysis. One of the technical challenges is to take care of both “joint confusability” and “marginal confusability”. In particular, the treatment of marginal confusability does not follow from the point-to-point results by Wang et al. Our achievability results follow from random coding with expurgation.},
  author       = {Zhang, Yihan},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  keywords     = {Computer Science Applications, Information Systems},
  number       = {7},
  pages        = {4093--4127},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Zero-error communication over adversarial MACs}},
  doi          = {10.1109/tit.2023.3257239},
  volume       = {69},
  year         = {2023},
}

@inproceedings{14921,
  abstract     = {Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart - deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC - the unconstrained features model - to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.},
  author       = {Súkeník, Peter and Mondelli, Marco and Lampert, Christoph},
  booktitle    = {37th Annual Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  title        = {{Deep neural collapse is provably optimal for the deep unconstrained features model}},
  year         = {2023},
}