@article{21488,
  abstract     = {Human height is a model for the genetic analysis of complex traits, and recent studies suggest the presence of thousands of common genetic variant associations and hundreds of low-frequency/rare variants. Here, we develop a new algorithmic paradigm based on approximate message passing (genomic vector approximate message passing [gVAMP]) for identifying DNA sequence variants associated with complex traits and common diseases in large-scale whole-genome sequencing (WGS) data. We show that gVAMP accurately localizes associations to variants with the correct frequency and position in the DNA, outperforming existing fine-mapping methods in selecting the appropriate genetic variants within WGS data. We then apply gVAMP to jointly model the relationship of tens of millions of WGS variants with human height in hundreds of thousands of UK Biobank individuals. We identify 59 rare variants and gene burden scores alongside many hundreds of DNA regions containing common variant associations and show that understanding the genetic basis of complex traits will require the joint analysis of hundreds of millions of variables measured on millions of people. The polygenic risk scores obtained from gVAMP have high accuracy (including a prediction accuracy of ∼46% for human height) and outperform current methods for downstream tasks such as mixed linear model association testing across 13 UK Biobank traits. In conclusion, gVAMP offers a scalable foundation for a wider range of analyses in WGS data.},
  author       = {Depope, Al and Bajzik, Jakub and Mondelli, Marco and Robinson, Matthew Richard},
  issn         = {2666-979X},
  journal      = {Cell Genomics},
  publisher    = {Elsevier},
  title        = {{Joint modeling of whole-genome sequencing data for human height via approximate message passing}},
  doi          = {10.1016/j.xgen.2026.101162},
  year         = {2026},
}

@inproceedings{20033,
  abstract     = {A growing number of machine learning scenarios rely on knowledge distillation where one uses the output of a surrogate model as labels to supervise the training of a target model. In this work, we provide a sharp characterization of this process for ridgeless, high-dimensional regression, under two settings: (i) model shift, where the surrogate model is arbitrary, and (ii) distribution shift, where the surrogate model is the solution of empirical risk minimization with out-of-distribution data. In both cases, we characterize the precise risk of the target model through non-asymptotic bounds in terms of sample size and data distribution under mild conditions. As a consequence, we identify the form of the optimal surrogate model, which reveals the benefits and limitations of discarding weak features in a data-dependent fashion. In the context of weak-to-strong (W2S) generalization, this has the interpretation that (i) W2S training, with the surrogate as the weak model, can provably outperform training with strong labels under the same data budget, but (ii) it is unable to improve the data scaling law. We validate our results on numerical experiments both on ridgeless regression and on neural network architectures.},
  author       = {Emrullah Ildiz, M. and Gozeten, Halil Alperen and Taga, Ege Onur and Mondelli, Marco and Oymak, Samet},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {2967--3006},
  publisher    = {ICLR},
  title        = {{High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws}},
  year         = {2025},
}

@inproceedings{20035,
  abstract     = {Deep neural networks (DNNs) at convergence consistently represent the training data in the last layer via a geometric structure referred to as neural collapse. This empirical evidence has spurred a line of theoretical research aimed at proving the emergence of neural collapse, mostly focusing on the unconstrained features model. Here, the features of the penultimate layer are free variables, which makes the model data-agnostic and puts into question its ability to capture DNN training. Our work addresses the issue, moving away from unconstrained features and
studying DNNs that end with at least two linear layers. We first prove generic guarantees on neural collapse that assume (i) low training error and balancedness of linear layers (for within-class variability collapse), and (ii) bounded conditioning of the features before the linear part (for orthogonality of class-means, and their alignment with weight matrices). The balancedness refers to the fact that W⊤ℓ+1Wℓ+1 ≈ WℓW⊤ℓfor any pair of consecutive weight matrices of the linear part, and the bounded conditioning requires a well-behaved ratio between largest and smallest non-zero singular values of the features. We then show that such assumptions hold for gradient descent training with weight decay: (i) for networks with a wide first layer, we prove low training error and balancedness, and (ii) for solutions that are either nearly optimal or stable under large learning rates, we additionally prove the bounded conditioning. Taken together, our results are the first to show neural collapse in the end-to-end training of DNNs.},
  author       = {Jacot, Arthur and Súkeník, Peter and Wang, Zihan and Mondelli, Marco},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {1905--1931},
  publisher    = {ICLR},
  title        = {{Wide neural networks trained with weight decay provably exhibit neural collapse}},
  year         = {2025},
}

@article{20081,
  abstract     = {Information measures can be constructed from Rényi divergences much like mutual information from Kullback-Leibler divergence. One such information measure is known as Sibson α-mutual information and has received renewed attention recently in several contexts: concentration of measure under dependence, statistical learning, hypothesis testing, and estimation theory. In this paper, we survey and extend the state of the art. In particular, we introduce variational representations for Sibson α-mutual information and employ them in each described context to derive novel results. Namely, we produce generalized Transportation-Cost inequalities and Fano-type inequalities. We also present an overview of known applications, spanning from learning theory and Bayesian risk to universal prediction.},
  author       = {Esposito, Amedeo Roberto and Gastpar, Michael and Issa, Ibrahim},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  publisher    = {IEEE},
  title        = {{Sibson α-mutual information and its variational representations}},
  doi          = {10.1109/TIT.2025.3587340},
  year         = {2025},
}

@inproceedings{20300,
  abstract     = {Simultaneously addressing multiple objectives is becoming increasingly important in modern machine learning. At the same time, data is often high-dimensional and costly to label. For a single objective such as prediction risk, conventional regularization techniques are known to improve generalization when the data exhibits low-dimensional structure like sparsity. However, it is largely unexplored how to leverage this structure in the context of multi-objective learning (MOL) with multiple competing objectives. In this work, we discuss how the application of vanilla regularization approaches can fail, and propose a two-stage MOL framework that can successfully leverage low-dimensional structure. We demonstrate its effectiveness experimentally for multi-distribution learning and fairness-risk trade-offs.},
  author       = {Wegel, Tobias and Kovačević, Filip and Ţifrea, Alexandru and Yang, Fanny},
  booktitle    = {The 28th International Conference on Artificial Intelligence and Statistics},
  issn         = {2640-3498},
  location     = {Mai Khao, Thailand},
  pages        = {4591--4599},
  publisher    = {ML Research Press},
  title        = {{Learning Pareto manifolds in high dimensions: How can regularization help?}},
  volume       = {258},
  year         = {2025},
}

@inproceedings{20667,
  abstract     = {We explore the problem of mean estimation for a high-dimensional binary symmetric Gaussian mixture model, where the label (sign) follows a time-inhomogeneous Markov chain. We propose a spectral estimator based on a partition of a subset of the samples to blocks. We develop a computationally efficient algorithm to find the optimal blocks, and derive minimax lower bounds on the estimation loss of any estimator, which establish the effectiveness of our proposed estimator. The resulting minimax rate illuminates the interplay between the sample size, dimension, signal strength, and the memory on the loss.},
  author       = {El Latif Kadry, Abd and Zhang, Yihan and Weinberger, Nir},
  booktitle    = {2025 IEEE International Symposium on Information Theory Proceedings},
  isbn         = {9798331543990},
  issn         = {2157-8095},
  location     = {Ann Arbor, MI, United States},
  publisher    = {IEEE},
  title        = {{Mean estimation in high-dimensional binary timeinhomogeneous Markov Gaussian mixture models}},
  doi          = {10.1109/ISIT63088.2025.11195426},
  year         = {2025},
}

@article{20734,
  abstract     = {We consider the problem of parameter estimation in a high-dimensional generalized linear model. Spectral methods obtained via the principal eigenvector of a suitable data-dependent matrix provide a simple yet surprisingly effective solution. However, despite their wide use, a rigorous performance characterization, as well as a principled way to preprocess the data, are available only for unstructured (i.i.d. Gaussian and Haar orthogonal) designs. In contrast, real-world data matrices are highly structured and exhibit non-trivial correlations. To address the problem, we consider correlated Gaussian designs capturing the anisotropic nature of the features via a covariance matrix Σ. Our main result is a precise asymptotic characterization of the performance of spectral estimators. This allows us to identify the optimal preprocessing that minimizes the number of samples needed for parameter estimation. Surprisingly, such preprocessing is universal across a broad set of designs, which partly addresses a conjecture on optimal spectral estimators for rotationally invariant models. Our principled approach vastly improves upon previous heuristic methods, including for designs common in computational imaging and genetics. The proposed methodology, based on approximate message passing, is broadly applicable and opens the way to the precise characterization of spiked matrices and of the corresponding spectral methods in a variety of settings.},
  author       = {Zhang, Yihan and Ji, Hong Chang and Venkataramanan, Ramji and Mondelli, Marco},
  issn         = {2520-2324},
  journal      = {Mathematical Statistics and Learning},
  number       = {3-4},
  pages        = {193--304},
  publisher    = {EMS Press},
  title        = {{Spectral estimators for structured generalized linear models via approximate message passing}},
  doi          = {10.4171/MSL/52},
  volume       = {8},
  year         = {2025},
}

@article{19065,
  abstract     = {The identification of the parameters of a neural network from finite samples of input-output pairs is often referred to as the teacher-student model, and this model has represented a popular framework for understanding training and generalization. Even if the problem is NP-complete in the worst case, a rapidly growing literature – after adding suitable distributional assumptions – has established finite sample identification of two-layer networks with a number of neurons (math. formula), D being the input dimension. For the range (math. formula) the problem becomes harder, and truly little is known for networks parametrized by biases as well. This paper fills the gap by providing efficient algorithms and rigorous theoretical guarantees of finite sample identification for such wider shallow networks with biases. Our approach is based on a two-step pipeline: first, we recover the direction of the weights, by exploiting second order information; next, we identify the signs by suitable algebraic evaluations, and we recover the biases by empirical risk minimization via gradient descent. Numerical results demonstrate the effectiveness of our approach.},
  author       = {Fornasier, Massimo and Klock, Timo and Mondelli, Marco and Rauchensteiner, Michael},
  issn         = {1096-603X},
  journal      = {Applied and Computational Harmonic Analysis},
  publisher    = {Elsevier},
  title        = {{Efficient identification of wide shallow neural networks with biases}},
  doi          = {10.1016/j.acha.2025.101749},
  volume       = {77},
  year         = {2025},
}

@inproceedings{19281,
  abstract     = {In this work, we consider the list-decodability and list-recoverability of codes in the zero-rate regime. Briefly, a code 𝒞 ⊆ [q]ⁿ is (p,𝓁,L)-list-recoverable if for all tuples of input lists (Y₁,… ,Y_n) with each Y_i ⊆ [q] and |Y_i| = 𝓁, the number of codewords c ∈ 𝒞 such that c_i ∉ Y_i for at most pn choices of i ∈ [n] is less than L; list-decoding is the special case of 𝓁 = 1. In recent work by Resch, Yuan and Zhang (ICALP 2023) the zero-rate threshold for list-recovery was determined for all parameters: that is, the work explicitly computes p_*: = p_*(q,𝓁,L) with the property that for all ε > 0 (a) there exist positive-rate (p_*-ε,𝓁,L)-list-recoverable codes, and (b) any (p_*+ε,𝓁,L)-list-recoverable code has rate 0. In fact, in the latter case the code has constant size, independent on n. However, the constant size in their work is quite large in 1/ε, at least |𝒞| ≥ (1/(ε))^O(q^L).
Our contribution in this work is to show that for all choices of q,𝓁 and L with q ≥ 3, any (p_*+ε,𝓁,L)-list-recoverable code must have size O_{q,𝓁,L}(1/ε), and furthermore this upper bound is complemented by a matching lower bound Ω_{q,𝓁,L}(1/ε). This greatly generalizes work by Alon, Bukh and Polyanskiy (IEEE Trans. Inf. Theory 2018) which focused only on the case of binary alphabet (and thus necessarily only list-decoding). We remark that we can in fact recover the same result for q = 2 and even L, as obtained by Alon, Bukh and Polyanskiy: we thus strictly generalize their work. 
Our main technical contribution is to (a) properly define a linear programming relaxation of the list-recovery condition over large alphabets; and (b) to demonstrate that a certain function defined on a q-ary probability simplex is maximized by the uniform distribution. This represents the core challenge in generalizing to larger q (as a binary simplex can be naturally identified with a one-dimensional interval). We can subsequently re-utilize certain Schur convexity and convexity properties established for a related function by Resch, Yuan and Zhang along with ideas of Alon, Bukh and Polyanskiy.},
  author       = {Resch, Nicolas and Yuan, Chen and Zhang, Yihan},
  booktitle    = {16th Innovations in Theoretical Computer Science Conference},
  isbn         = {9783959773614},
  issn         = {1868-8969},
  location     = {New York, NY, United States},
  publisher    = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
  title        = {{Tight bounds on list-decodable and list-recoverable zero-rate codes}},
  doi          = {10.4230/LIPIcs.ITCS.2025.82},
  volume       = {325},
  year         = {2025},
}

@inproceedings{21324,
  abstract     = {Learning models have been shown to rely on spurious correlations between non-predictive features and the associated labels in the training data, with negative implications on robustness, bias and fairness. In this work, we provide a statistical characterization of this phenomenon for high-dimensional regression, when the data contains a predictive core feature x and a spurious feature y. Specifically, we quantify the amount of spurious correlations C learned via linear regression, in terms of the data covariance and the strength λ of the ridge regularization. As a consequence, we first capture the simplicity of y through the spectrum of its covariance, and its correlation with x through the Schur complement of the full data covariance. Next, we prove a trade-off between C and the in-distribution test loss L, by showing that the value of λ that minimizes L lies in an interval where C is increasing. Finally, we investigate the effects of over-parameterization via the random features model, by showing its equivalence to regularized linear regression. Our theoretical results are supported by numerical experiments on Gaussian, Color-MNIST, and CIFAR-10 datasets.},
  author       = {Bombari, Simone and Mondelli, Marco},
  booktitle    = {Proceedings of the 42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {4839--4873},
  publisher    = {ML Research Press},
  title        = {{Spurious correlations in high dimensional regression: The roles of regularization, simplicity bias and over-parameterization}},
  volume       = {267},
  year         = {2025},
}

@inproceedings{21325,
  abstract     = {Test-time training (TTT) methods explicitly update the weights of a model to adapt to the specific test instance, and they have found success in a variety of settings, including most recently language modeling and reasoning. To demystify this success, we investigate a gradient-based TTT algorithm for in-context learning, where we train a transformer model on the in-context demonstrations provided in the test prompt. Specifically, we provide a comprehensive theoretical characterization of linear transformers when the update rule is a single gradient step. Our theory (i) delineates the role of alignment between pretraining distribution and target task, (ii) demystifies how TTT can alleviate distribution shift, and (iii) quantifies the sample complexity of TTT including how it can significantly reduce the eventual sample size required for in-context learning. As our empirical contribution, we study the benefits of TTT for TabPFN, a tabular foundation model. In line with our theory, we demonstrate that TTT significantly reduces the required sample size for tabular classification (3 to 5 times fewer) unlocking substantial inference efficiency with a negligible training cost.},
  author       = {Gozeten, Halil Alperen and Ildiz, Muhammed Emrullah and Zhang, Xuechen and Soltanolkotabi, Mahdi and Mondelli, Marco and Oymak, Samet},
  booktitle    = {Proceedings of the 42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {20266--20295},
  publisher    = {ML Research Press},
  title        = {{Test-time training provably improves transformers as in-context learners}},
  volume       = {267},
  year         = {2025},
}

@inproceedings{21326,
  abstract     = {Neural Collapse is a phenomenon where the last-layer representations of a well-trained neural network converge to a highly structured geometry. In this paper, we focus on its first (and most basic) property, known as NC1: the within-class variability vanishes. While prior theoretical studies establish the occurrence of NC1 via the data-agnostic unconstrained features model, our work adopts a data-specific perspective, analyzing NC1 in a three-layer neural network, with the first two layers operating in the mean-field regime and followed by a linear layer. In particular, we establish a fundamental connection between NC1 and the loss landscape: we prove that points with small empirical loss and gradient norm (thus, close to being stationary) approximately satisfy NC1, and the closeness to NC1 is controlled by the residual loss and gradient norm. We then show that (i) gradient flow on the mean squared error converges to NC1 solutions with small empirical loss, and (ii) for well-separated data distributions, both NC1 and vanishing test loss are achieved simultaneously. This aligns with the empirical observation that NC1 emerges during training while models attain near-zero test error. Overall, our results demonstrate that NC1 arises from gradient training due to the properties of the loss landscape, and they show the co-occurrence of NC1 and small test error for certain data distributions.},
  author       = {Wu, Diyuan and Mondelli, Marco},
  booktitle    = {Proceedings of the 42nd International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vancouver, Canada},
  pages        = {67499--67536},
  publisher    = {ML Research Press},
  title        = {{Neural collapse beyond the unconstrained features model: Landscape, dynamics, and generalization in the mean-field regime}},
  volume       = {267},
  year         = {2025},
}

@inproceedings{21328,
  abstract     = {Multi-index models provide a popular framework to investigate the learnability of functions with low-dimensional structure and, also due to their connections with neural networks, they have been object of recent intensive study. In this paper, we focus on recovering the subspace spanned by the signals via spectral estimators – a family of methods routinely used in practice, often as a warm-start for iterative algorithms. Our main technical contribution is a precise asymptotic characterization of the performance of spectral methods, when sample size and input dimension grow proportionally and the dimension p of the space to recover is fixed. Specifically, we locate the top-p eigenvalues of the spectral matrix and establish the overlaps between the corresponding eigenvectors (which give the spectral estimators) and a basis of the signal subspace. Our analysis unveils a phase transition phenomenon in which, as the sample complexity grows, eigenvalues escape from the bulk of the spectrum and, when that happens, eigenvectors recover directions of the desired subspace. The precise characterization we put forward enables the optimization of the data preprocessing, thus allowing to identify the spectral estimator that requires the minimal sample size for weak recovery.},
  author       = {Kovačević, Filip and Yihan, Zhang and Mondelli, Marco},
  booktitle    = {Proceedings of 38th Conference on Learning Theory},
  issn         = {2640-3498},
  location     = {Lyon, France},
  pages        = {3354--3404},
  publisher    = {ML Research Press},
  title        = {{Spectral estimators for multi-index models: Precise asymptotics and optimal weak recovery}},
  volume       = {291},
  year         = {2025},
}

@article{18986,
  abstract     = {We consider a prototypical problem of Bayesian inference for a structured spiked model: a low-rank signal is corrupted by additive noise. While both information-theoretic and algorithmic limits are well understood when the noise is a Gaussian Wigner matrix, the more realistic case of structured noise still remains challenging. To capture the structure while maintaining mathematical tractability, a line of work has focused on rotationally invariant noise. However, existing studies either provide suboptimal algorithms or are limited to a special class of noise ensembles. In this paper, using tools from statistical physics (replica method) and random matrix theory (generalized spherical integrals) we establish the characterization of the information-theoretic limits for a noise matrix drawn from a general trace ensemble. Remarkably, our analysis unveils the asymptotic equivalence between the rotationally invariant model and a surrogate Gaussian one. Finally, we show how to saturate the predicted statistical limits using an efficient algorithm inspired by the theory of adaptive Thouless-Anderson-Palmer (TAP) equations.},
  author       = {Barbier, Jean and Camilli, Francesco and Xu, Yizhou and Mondelli, Marco},
  issn         = {2643-1564},
  journal      = {Physical Review Research},
  publisher    = {American Physical Society},
  title        = {{Information limits and Thouless-Anderson-Palmer equations for spiked matrix models with structured noise}},
  doi          = {10.1103/PhysRevResearch.7.013081},
  volume       = {7},
  year         = {2025},
}

@article{19627,
  abstract     = {Differentially private gradient descent (DP-GD) is a popular algorithm to train deep learning models with provable guarantees on the privacy of the training data. In the last decade, the problem of understanding its performance cost with respect to standard GD has received remarkable attention from the research community, which formally derived upper bounds on the excess population risk  RP  in different learning settings. However, existing bounds typically degrade with over-parameterization, i.e., as the number of parameters  p  gets larger than the number of training samples  n  -- a regime which is ubiquitous in current deep-learning practice. As a result, the lack of theoretical insights leaves practitioners without clear guidance, leading some to reduce the effective number of trainable parameters to improve performance, while others use larger models to achieve better results through scale. In this work, we show that in the popular random features model with quadratic loss, for any sufficiently large  p , privacy can be obtained for free, i.e.,  |RP|=o(1) , not only when the privacy parameter  ε  has constant order, but also in the strongly private setting  ε=o(1) . This challenges the common wisdom that over-parameterization inherently hinders performance in private learning.},
  author       = {Bombari, Simone and Mondelli, Marco},
  issn         = {1091-6490},
  journal      = {Proceedings of the National Academy of Sciences},
  number       = {15},
  publisher    = {National Academy of Sciences},
  title        = {{Privacy for free in the overparameterized regime}},
  doi          = {10.1073/pnas.2423072122},
  volume       = {122},
  year         = {2025},
}

@article{14665,
  abstract     = {We derive lower bounds on the maximal rates for multiple packings in high-dimensional Euclidean spaces. For any N > 0 and L ∈ Z ≥2 , a multiple packing is a set C of points in R n such that any point in R n lies in the intersection of at most L - 1 balls of radius √ nN around points in C . This is a natural generalization of the sphere packing problem. We study the multiple packing problem for both bounded point sets whose points have norm at most √ nP for some constant P > 0, and unbounded point sets whose points are allowed to be anywhere in R n . Given a well-known connection with coding theory, multiple packings can be viewed as the Euclidean analog of list-decodable codes, which are well-studied over finite fields. We derive the best known lower bounds on the optimal multiple packing density. This is accomplished by establishing an inequality which relates the list-decoding error exponent for additive white Gaussian noise channels, a quantity of average-case nature, to the list-decoding radius, a quantity of worst-case nature. We also derive novel bounds on the list-decoding error exponent for infinite constellations and closed-form expressions for the list-decoding error exponents for the power-constrained AWGN channel, which may be of independent interest beyond multiple packing.},
  author       = {Zhang, Yihan and Vatedka, Shashank},
  issn         = {1557-9654},
  journal      = {IEEE Transactions on Information Theory},
  number       = {2},
  pages        = {1008--1039},
  publisher    = {IEEE},
  title        = {{Multiple packing: Lower bounds via error exponents}},
  doi          = {10.1109/TIT.2023.3334032},
  volume       = {70},
  year         = {2024},
}

@inproceedings{17893,
  abstract     = {Strong data processing inequalities (SDPI) are an important object of study in Information Theory and have been well studied for f -divergences. Universal upper and lower bounds have been provided along with several applications, connecting them to impossibility (converse) results, concentration of measure, hypercontractivity, and so on. In this paper, we study Renyi divergence and the corresponding SDPI constant whose behavior seems to deviate from that of ordinary <1>-divergences. In particular, one can find examples showing that the universal upper bound relating its SDPI constant to the one of Total Variation does not hold in general. In this work, we prove, however, that the universal lower bound involving the SDPI constant of the Chi-square divergence does indeed hold. Furthermore, we also provide a characterization of the distribution that achieves the supremum when is equal to 2 and consequently compute the SDPI constant for Renyi divergence of the general binary channel.},
  author       = {Jin, Lifu and Esposito, Amedeo Roberto and Gastpar, Michael},
  booktitle    = {Proceedings of the 2024 IEEE International Symposium on Information Theory},
  isbn         = {9798350382846},
  issn         = {2157-8095},
  location     = {Athens, Greece},
  pages        = {3178--3183},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Properties of the strong data processing constant for Rényi divergence}},
  doi          = {10.1109/ISIT57864.2024.10619367},
  year         = {2024},
}

@inproceedings{17894,
  abstract     = {Sibson's α -mutual information has received renewed attention recently in several contexts: concentration of measure under dependence, statistical learning, hypothesis testing, and estimation theory. In this work, we introduce several variational representations of Sibson's α -mutual information: 1) as a supremum over joint distributions of (a combination of) KL divergences; and 2) as a supremum over functions of opportune expected values. Leveraging them, we produce a variety of novel and known results, including a generalization of transportation-cost inequalities and Fano's inequality.},
  author       = {Esposito, Amedeo Roberto and Gastpar, Michael and Issa, Ibrahim},
  booktitle    = {Proceedings of the 2024 IEEE International Symposium on Information Theory },
  isbn         = {9798350382846},
  issn         = {2157-8095},
  location     = {Athens, Greece},
  pages        = {2110--2115},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Variational characterizations of Sibson's α-mutual information}},
  doi          = {10.1109/ISIT57864.2024.10619378},
  year         = {2024},
}

@inproceedings{17895,
  abstract     = {We propose a concatenated code construction for a class of discrete-alphabet oblivious arbitrarily varying channels (AVCs) with cost constraints. The code has time and space complexity polynomial in the blocklength n . It uses a Reed-Solomon outer code, logarithmic blocklength random inner codes, and stochastic encoding by permuting the codeword before transmission. When the channel satisfies a condition called strong DS-nonsymmetrizability (a modified version of nonsymmetrizability originally due to Dobrushin and Stambler), we show that the code achieves a rate that for a variety of oblivious AVCs (such as classically studied error/erasure channels) match the known capacities.},
  author       = {Dey, B. K. and Jaggi, S. and Langberg, M. and Sarwate, A. D. and Zhang, Yihan},
  booktitle    = {Proceedings of the 2024 IEEE International Symposium on Information Theory },
  isbn         = {9798350382846},
  issn         = {2157-8095},
  location     = {Athens, Greece},
  pages        = {1586--1591},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Computationally efficient codes for strongly Dobrushin-Stambler nonsymmetrizable oblivious AVCs}},
  doi          = {10.1109/ISIT57864.2024.10619362},
  year         = {2024},
}

@article{18652,
  abstract     = {Over the last 70 years, information theory and coding has enabled communication technologies that have had an astounding impact on our lives. This is possible due to the match between encoding/decoding strategies and corresponding channel models. Traditional studies of channels have taken one of two extremes: Shannon-theoretic models are inherently average-case in which channel noise is governed by a memoryless stochastic process, whereas coding-theoretic (referred to as “Hamming”) models take a worst-case, adversarial, view of the noise. However, for several existing and emerging communication systems the Shannon/average-case view may be too optimistic, whereas the Hamming/worstcase view may be too pessimistic. This monograph takes up the challenge of studying adversarial channel models that lie between the Shannon and Hamming extremes.},
  author       = {Dey, Bikash Kumar and Jaggi, Sidharth and Langberg, Michael and Sarwate, Anand D. and Zhang, Yihan},
  issn         = {1567-2328},
  journal      = {Foundations and Trends in Communications and Information Theory},
  number       = {3-4},
  pages        = {300--588},
  publisher    = {Now Publishers},
  title        = {{Codes for adversaries: Between worst-case and average-case jamming}},
  doi          = {10.1561/0100000112},
  volume       = {21},
  year         = {2024},
}