@inproceedings{14921,
  abstract     = {Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart - deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC - the unconstrained features model - to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.},
  author       = {Súkeník, Peter and Mondelli, Marco and Lampert, Christoph},
  booktitle    = {37th Annual Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  title        = {{Deep neural collapse is provably optimal for the deep unconstrained features model}},
  year         = {2023},
}

@inproceedings{14922,
  abstract     = {We propose a novel approach to concentration for non-independent random variables. The main idea is to ``pretend'' that the random variables are independent and pay a multiplicative price measuring how far they are from actually being independent. This price is encapsulated in the Hellinger integral between the joint and the product of the marginals, which is then upper bounded leveraging tensorisation properties. Our bounds represent a natural generalisation of concentration inequalities in the presence of dependence: we recover exactly the classical bounds (McDiarmid's inequality) when the random variables are independent. Furthermore, in a ``large deviations'' regime, we obtain the same decay in the probability as for the independent case, even when the random variables display non-trivial dependencies. To show this, we consider a number of applications of interest. First, we provide a bound for Markov chains with finite state space. Then, we consider the Simple Symmetric Random Walk, which is a non-contracting Markov chain, and a non-Markovian setting in which the stochastic process depends on its entire past. To conclude, we propose an application to Markov Chain Monte Carlo methods, where our approach leads to an improved lower bound on the minimum burn-in period required to reach a certain accuracy. In all of these settings, we provide a regime of parameters in which our bound fares better than what the state of the art can provide.},
  author       = {Esposito, Amedeo Roberto and Mondelli, Marco},
  booktitle    = {Proceedings of 2023 IEEE International Symposium on Information Theory},
  issn         = {2157-8117},
  location     = {Taipei, Taiwan},
  pages        = {400--405},
  publisher    = {IEEE},
  title        = {{Concentration without independence via information measures}},
  doi          = {10.1109/isit54713.2023.10206899},
  year         = {2023},
}

@inproceedings{14923,
  abstract     = {We study the performance of a Bayesian statistician who estimates a rank-one signal corrupted by non-symmetric rotationally invariant noise with a generic distribution of singular values. As the signal-to-noise ratio and the noise structure are unknown, a Gaussian setup is incorrectly assumed. We derive the exact analytic expression for the error of the mismatched Bayes estimator and also provide the analysis of an approximate message passing (AMP) algorithm. The first result exploits the asymptotic behavior of spherical integrals for rectangular matrices and of low-rank matrix perturbations; the second one relies on the design and analysis of an auxiliary AMP. The numerical experiments show that there is a performance gap between the AMP and Bayes estimators, which is due to the incorrect estimation of the signal norm.},
  author       = {Fu, Teng and Liu, YuHao and Barbier, Jean and Mondelli, Marco and Liang, ShanSuo and Hou, TianQi},
  booktitle    = {Proceedings of 2023 IEEE International Symposium on Information Theory},
  isbn         = {9781665475549},
  issn         = {2157-8117},
  location     = {Taipei, Taiwan},
  pages        = {1178--1183},
  publisher    = {IEEE},
  title        = {{Mismatched estimation of non-symmetric rank-one matrices corrupted by structured noise}},
  doi          = {10.1109/isit54713.2023.10206671},
  year         = {2023},
}

@article{14924,
  abstract     = {The stochastic heavy ball method (SHB), also known as stochastic gradient descent (SGD) with Polyak's momentum, is widely used in training neural networks. However, despite the remarkable success of such algorithm in practice, its theoretical characterization remains limited. In this paper, we focus on neural networks with two and three layers and provide a rigorous understanding of the properties of the solutions found by SHB: \emph{(i)} stability after dropping out part of the neurons, \emph{(ii)} connectivity along a low-loss path, and \emph{(iii)} convergence to the global optimum.
To achieve this goal, we take a mean-field view and relate the SHB dynamics to a certain partial differential equation in the limit of large network widths. This mean-field perspective has inspired a recent line of work focusing on SGD while, in contrast, our paper considers an algorithm with momentum. More specifically, after proving existence and uniqueness of the limit differential equations, we show convergence to the global optimum and give a quantitative bound between the mean-field limit and the SHB dynamics of a finite-width network. Armed with this last bound, we are able to establish the dropout-stability and connectivity of SHB solutions.},
  author       = {Wu, Diyuan and Kungurtsev, Vyacheslav and Mondelli, Marco},
  journal      = {Transactions on Machine Learning Research},
  publisher    = {ML Research Press},
  title        = {{Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence}},
  year         = {2023},
}

@unpublished{14948,
  abstract     = {The extraction of modular object-centric representations for downstream tasks
is an emerging area of research. Learning grounded representations of objects
that are guaranteed to be stable and invariant promises robust performance
across different tasks and environments. Slot Attention (SA) learns
object-centric representations by assigning objects to \textit{slots}, but
presupposes a \textit{single} distribution from which all slots are randomly
initialised. This results in an inability to learn \textit{specialized} slots
which bind to specific object types and remain invariant to identity-preserving
changes in object appearance. To address this, we present
\emph{\textsc{Co}nditional \textsc{S}lot \textsc{A}ttention} (\textsc{CoSA})
using a novel concept of \emph{Grounded Slot Dictionary} (GSD) inspired by
vector quantization. Our proposed GSD comprises (i) canonical object-level
property vectors and (ii) parametric Gaussian distributions, which define a
prior over the slots. We demonstrate the benefits of our method in multiple
downstream tasks such as scene generation, composition, and task adaptation,
whilst remaining competitive with SA in popular object discovery benchmarks.},
  author       = {Kori, Avinash and Locatello, Francesco and Ribeiro, Fabio De Sousa and Toni, Francesca and Glocker, Ben},
  booktitle    = {arXiv},
  title        = {{Grounded object centric learning}},
  doi          = {10.48550/arXiv.2307.09437},
  year         = {2023},
}

@article{14949,
  abstract     = {Many approaches have been proposed to use diffusion models to augment training datasets for downstream tasks, such as classification. However, diffusion models are themselves trained on large datasets, often with noisy annotations, and it remains an open question to which extent these models contribute to downstream classification performance. In particular, it remains unclear if they generalize enough to improve over directly using the additional data of their pre-training process for augmentation. We systematically evaluate a range of existing methods to generate images from diffusion models and study new extensions to assess their benefit for data augmentation. Personalizing diffusion models towards the target data outperforms simpler prompting strategies. However, using the pre-training data of the diffusion model alone, via a simple nearest-neighbor retrieval procedure, leads to even stronger downstream performance. Our study explores the potential of diffusion models in generating new training data, and surprisingly finds that these sophisticated models are not yet able to beat a simple and strong image retrieval baseline on simple downstream vision tasks.},
  author       = {Burg, Max and Wenzel, Florian and Zietlow, Dominik and Horn, Max and Makansi, Osama and Locatello, Francesco and Russell, Chris},
  issn         = {2835-8856},
  journal      = {Journal of Machine Learning Research},
  publisher    = {ML Research Press},
  title        = {{Image retrieval outperforms diffusion models on data augmentation}},
  year         = {2023},
}

@unpublished{14952,
  abstract     = {While different neural models often exhibit latent spaces that are alike when exposed to semantically related data, this intrinsic similarity is not always immediately discernible. Towards a better understanding of this phenomenon, our work shows how representations learned from these neural modules can be translated between different pre-trained networks via simpler transformations than previously thought. An advantage of this approach is the ability to
estimate these transformations using standard, well-understood algebraic procedures that have closed-form solutions. Our method directly estimates a transformation between two given latent spaces, thereby enabling effective stitching of encoders and decoders without additional training. We extensively validate the adaptability of this translation procedure in different
experimental settings: across various trainings, domains, architectures (e.g., ResNet, CNN, ViT), and in multiple downstream tasks (classification, reconstruction). Notably, we show how it is possible to zero-shot stitch text encoders and vision decoders, or vice-versa, yielding surprisingly good classification performance in this multimodal setting.},
  author       = {Maiorca, Valentino and Moschella, Luca and Norelli, Antonio and Fumero, Marco and Locatello, Francesco and Rodolà, Emanuele},
  booktitle    = {arXiv},
  title        = {{Latent space translation via semantic alignment}},
  doi          = {10.48550/arXiv.2311.00664},
  year         = {2023},
}

@unpublished{14953,
  abstract     = {This paper provides statistical sample complexity bounds for score-matching and its applications in causal discovery. We demonstrate that accurate estimation of the score function is achievable by training a standard deep ReLU neural network using stochastic gradient descent. We establish bounds on the error rate of recovering causal relationships using the score-matching-based causal discovery method of Rolland et al. [2022], assuming a sufficiently good estimation of the score function. Finally, we analyze the upper bound of score-matching estimation within the score-based generative modeling, which has been applied for causal discovery but is also of independent interest within the domain of generative models.},
  author       = {Zhu, Zhenyu and Locatello, Francesco and Cevher, Volkan},
  booktitle    = {arXiv},
  title        = {{Sample complexity bounds for score-matching: Causal discovery and generative modeling}},
  doi          = {10.48550/arXiv.2310.18123},
  year         = {2023},
}

@unpublished{14954,
  abstract     = {When domain knowledge is limited and experimentation is restricted by ethical, financial, or time constraints, practitioners turn to observational causal discovery methods to recover the causal structure, exploiting the statistical properties of their data. Because causal discovery without further assumptions is an ill-posed problem, each algorithm comes with its own set of
usually untestable assumptions, some of which are hard to meet in real datasets. Motivated by these considerations, this paper extensively benchmarks the empirical performance of recent causal discovery methods on observational i.i.d. data generated under different background conditions, allowing for violations of the critical assumptions required by each selected approach. Our experimental findings show that score matching-based methods demonstrate
surprising performance in the false positive and false negative rate of the inferred graph in these challenging scenarios, and we provide theoretical insights into their performance. This work is also the first effort to benchmark the stability of causal discovery algorithms with respect to the values of their hyperparameters. Finally, we hope this paper will set a new standard for the evaluation of causal discovery methods and can serve as an accessible entry point for practitioners interested in the field, highlighting the empirical implications of different algorithm choices.},
  author       = {Montagna, Francesco and Mastakouri, Atalanti A. and Eulig, Elias and Noceti, Nicoletta and Rosasco, Lorenzo and Janzing, Dominik and Aragam, Bryon and Locatello, Francesco},
  booktitle    = {arXiv},
  title        = {{Assumption violations in causal discovery and the robustness of score matching}},
  doi          = {10.48550/arXiv.2310.13387},
  year         = {2023},
}

@inproceedings{14958,
  abstract     = {Causal representation learning (CRL) aims at identifying high-level causal variables from low-level data, e.g. images. Current methods usually assume that all causal variables are captured in the high-dimensional observations. In this work, we focus on learning causal representations from data under partial observability, i.e., when some of the causal variables are not observed in the measurements, and the set of masked variables changes across the different samples. We introduce some initial theoretical results for identifying causal variables under partial observability by exploiting a sparsity regularizer, focusing in particular on the linear and piecewise linear mixing function case. We provide a theorem that allows us to identify the causal variables up to permutation and element-wise linear transformations in the linear case and a lemma that allows us to identify causal variables up to linear transformation in the piecewise case. Finally, we provide a conjecture that would allow us to identify the causal variables up to permutation and element-wise linear transformations also in the piecewise linear case. We test the theorem and conjecture on simulated data, showing the effectiveness of our method.},
  author       = {Xu, Danru and Yao, Dingling and Lachapelle, Sebastien and Taslakian, Perouz and von Kügelgen, Julius and Locatello, Francesco and Magliacane, Sara},
  booktitle    = {Causal Representation Learning Workshop at NeurIPS 2023},
  location     = {New Orleans, LA, United States},
  publisher    = {OpenReview},
  title        = {{A sparsity principle for partially observable causal representation learning}},
  year         = {2023},
}

@unpublished{14961,
  abstract     = {The use of simulated data in the field of causal discovery is ubiquitous due to the scarcity of annotated real data. Recently, Reisach et al., 2021 highlighted the emergence of patterns in simulated linear data, which displays increasing marginal variance in the causal direction. As an ablation in their experiments, Montagna et al., 2023 found that similar patterns may emerge in
nonlinear models for the variance of the score vector $\nabla \log p_{\mathbf{X}}$, and introduced the ScoreSort algorithm. In this work, we formally define and characterize this score-sortability pattern of nonlinear additive noise models. We find that it defines a class of identifiable (bivariate) causal models overlapping with nonlinear additive noise models. We
theoretically demonstrate the advantages of ScoreSort in terms of statistical efficiency compared to prior state-of-the-art score matching-based methods and empirically show the score-sortability of the most common synthetic benchmarks in the literature. Our findings remark (1) the lack of diversity in the data as an important limitation in the evaluation of nonlinear causal discovery approaches, (2) the importance of thoroughly testing different settings within a problem class, and (3) the importance of analyzing statistical properties in
causal discovery, where research is often limited to defining identifiability conditions of the model. },
  author       = {Montagna, Francesco and Noceti, Nicoletta and Rosasco, Lorenzo and Locatello, Francesco},
  booktitle    = {arXiv},
  title        = {{Shortcuts for causal discovery of nonlinear models by score matching}},
  doi          = {10.48550/arXiv.2310.14246},
  year         = {2023},
}

@unpublished{14962,
  abstract     = {In this paper, we show that recent advances in video representation learning
and pre-trained vision-language models allow for substantial improvements in
self-supervised video object localization. We propose a method that first
localizes objects in videos via a slot attention approach and then assigns text
to the obtained slots. The latter is achieved by an unsupervised way to read
localized semantic information from the pre-trained CLIP model. The resulting
video object localization is entirely unsupervised apart from the implicit
annotation contained in CLIP, and it is effectively the first unsupervised
approach that yields good results on regular video benchmarks.},
  author       = {Fan, Ke and Bai, Zechen and Xiao, Tianjun and Zietlow, Dominik and Horn, Max and Zhao, Zixu and Simon-Gabriel, Carl-Johann and Shou, Mike Zheng and Locatello, Francesco and Schiele, Bernt and Brox, Thomas and Zhang, Zheng and Fu, Yanwei and He, Tong},
  booktitle    = {arXiv},
  title        = {{Unsupervised open-vocabulary object localization in videos}},
  doi          = {10.48550/arXiv.2309.09858},
  year         = {2023},
}

@unpublished{14963,
  abstract     = {Unsupervised object-centric learning methods allow the partitioning of scenes
into entities without additional localization information and are excellent
candidates for reducing the annotation burden of multiple-object tracking (MOT)
pipelines. Unfortunately, they lack two key properties: objects are often split
into parts and are not consistently tracked over time. In fact,
state-of-the-art models achieve pixel-level accuracy and temporal consistency
by relying on supervised object detection with additional ID labels for the
association through time. This paper proposes a video object-centric model for
MOT. It consists of an index-merge module that adapts the object-centric slots
into detection outputs and an object memory module that builds complete object
prototypes to handle occlusions. Benefited from object-centric learning, we
only require sparse detection labels (0\%-6.25\%) for object localization and
feature binding. Relying on our self-supervised
Expectation-Maximization-inspired loss for object association, our approach
requires no ID labels. Our experiments significantly narrow the gap between the
existing object-centric model and the fully supervised state-of-the-art and
outperform several unsupervised trackers.},
  author       = {Zhao, Zixu and Wang, Jiaze and Horn, Max and Ding, Yizhuo and He, Tong and Bai, Zechen and Zietlow, Dominik and Simon-Gabriel, Carl-Johann and Shuai, Bing and Tu, Zhuowen and Brox, Thomas and Schiele, Bernt and Fu, Yanwei and Locatello, Francesco and Zhang, Zheng and Xiao, Tianjun},
  booktitle    = {arXiv},
  title        = {{Object-centric multiple object tracking}},
  doi          = {10.48550/arXiv.2309.00233},
  year         = {2023},
}

@misc{14965,
  abstract     = {A method of determining a correspondence between a first biological property of a cell and one or more further biological properties of cells is provided. The first biological property and the further biological properties are determined by different analysis techniques and each are contained in a respective one of a plurality of sets of biological properties. The method includes the steps of: converting the plurality of sets of biological properties into corresponding representations in a representation format which is invariant to the technologies used to derive the biological properties; determining, in said representation format, a representation from each of the converted sets of further biological properties which most closely matches the first representation of the first biological property; and re-converting the determined representations from the representation format back to the biological properties associated with the determined representations and thereby determining a correspondence between the first biological property and each of the further biological properties.},
  author       = {Ficek, Joanna and Lehmann, Kjong-Van and Locatello, Francesco and Raetsch, Gunnar and Stark, Stefan},
  pages        = {9},
  title        = {{Methods of determining correspondences between biological properties of cells}},
  year         = {2023},
}

@inproceedings{14974,
  abstract     = {The field of machine learning and AI has witnessed remarkable breakthroughs with the emergence of LLMs, which have also sparked a lively debate in the causal community. As researchers in this field, we are interested in exploring how LLMs relate to causality research, and how we can leverage the technology to advance it. In the second conference of Causal Learning and Reasoning (CLeaR), 2023, we held a round table discussion to gather and integrate the diverse perspectives of the CLeaR community on this topic.
There is a general consensus that LLMs are not yet capable of causal reasoning at the current
stage but have a lot of potential with publicly available information as of CLeaR 2023. Enhancing causal machine learning is vital not only for its own sake but also to help LLMs improve their performance, especially regarding trustworthiness. In this document, we present both the summary and the raw outcome of the round table discussion. We acknowledge that with the progress of both fields, the opportunities and impact may rapidly change. We will repeat the same exercise in CLeaR 2024 to document the evolution.},
  author       = {Zhang, Cheng and Janzing, Dominik and van der Schaar, Mihaela and Locatello, Francesco and Spirtes, Peter and Zhang, Kun and Schölkopf, Bernhard and Uhler, Caroline},
  booktitle    = {2nd Conference on Causal Learning and Reasoning},
  location     = {Tübingen, Germany},
  title        = {{Causality in the time of LLMs: Round table discussion results of CLeaR 2023}},
  year         = {2023},
}

@article{14985,
  abstract     = {Lead sulfide (PbS) presents large potential in thermoelectric application due to its earth-abundant S element. However, its inferior average ZT (ZTave) value makes PbS less competitive with its analogs PbTe and PbSe. To promote its thermoelectric performance, this study implements strategies of continuous Se alloying and Cu interstitial doping to synergistically tune thermal and electrical transport properties in n-type PbS. First, the lattice parameter of 5.93 Å in PbS is linearly expanded to 6.03 Å in PbS0.5Se0.5 with increasing Se alloying content. This expanded lattice in Se-alloyed PbS not only intensifies phonon scattering but also facilitates the formation of Cu interstitials. Based on the PbS0.6Se0.4 content with the minimal lattice thermal conductivity, Cu interstitials are introduced to improve the electron density, thus boosting the peak power factor, from 3.88 μW cm−1 K−2 in PbS0.6Se0.4 to 20.58 μW cm−1 K−2 in PbS0.6Se0.4−1%Cu. Meanwhile, the lattice thermal conductivity in PbS0.6Se0.4−x%Cu (x = 0–2) is further suppressed due to the strong strain field caused by Cu interstitials. Finally, with the lowered thermal conductivity and high electrical transport properties, a peak ZT ~1.1 and ZTave ~0.82 can be achieved in PbS0.6Se0.4 − 1%Cu at 300–773K, which outperforms previously reported n-type PbS.},
  author       = {Liu, Zhengtao and Hong, Tao and Xu, Liqing and Wang, Sining and Gao, Xiang and Chang, Cheng and Ding, Xiangdong and Xiao, Yu and Zhao, Li-Dong},
  issn         = {2767-441X},
  journal      = {Interdisciplinary Materials},
  number       = {1},
  pages        = {161--170},
  publisher    = {Wiley},
  title        = {{Lattice expansion enables interstitial doping to achieve a high average ZT in n-type PbS}},
  doi          = {10.1002/idm2.12056},
  volume       = {2},
  year         = {2023},
}

@inproceedings{14989,
  abstract     = {Encryption alone is not enough for secure end-to end encrypted messaging: a server must also honestly serve public keys to users. Key transparency has been presented as an efficient
solution for detecting (and hence deterring) a server that attempts to dishonestly serve keys. Key transparency involves two major components: (1) a username to public key mapping, stored and cryptographically committed to by the server, and, (2) an outof-band consistency protocol for serving short commitments to users. In the setting of real-world deployments and supporting production scale, new challenges must be considered for both of these components. We enumerate these challenges and provide solutions to address them. In particular, we design and implement a memory-optimized and privacy-preserving verifiable data structure for committing to the username to public key store.
To make this implementation viable for production, we also integrate support for persistent and distributed storage. We also propose a future-facing solution, termed “compaction”, as
a mechanism for mitigating practical issues that arise from dealing with infinitely growing server data structures. Finally, we implement a consensusless solution that achieves the minimum requirements for a service that consistently distributes commitments for a transparency application, providing a much more efficient protocol for distributing small and consistent
commitments to users. This culminates in our production-grade implementation of a key transparency system (Parakeet) which we have open-sourced, along with a demonstration of feasibility through our benchmarks.},
  author       = {Malvai, Harjasleen and Kokoris Kogias, Eleftherios and Sonnino, Alberto and Ghosh, Esha and Oztürk, Ercan and Lewi, Kevin and Lawlor, Sean},
  booktitle    = {Proceedings of the 2023 Network and Distributed System Security Symposium},
  isbn         = {1891562835},
  location     = {San Diego, CA, United States},
  publisher    = {Internet Society},
  title        = {{Parakeet: Practical key transparency for end-to-end encrypted messaging}},
  doi          = {10.14722/ndss.2023.24545},
  year         = {2023},
}

@misc{14990,
  abstract     = {The software artefact to evaluate the approximation of stationary distributions implementation.},
  author       = {Meggendorfer, Tobias},
  publisher    = {Zenodo},
  title        = {{Artefact for: Correct Approximation of Stationary Distributions}},
  doi          = {10.5281/ZENODO.7548214},
  year         = {2023},
}

@misc{14991,
  abstract     = {This repository contains the data, scripts, WRF codes and files required to reproduce the results of the manuscript "Assessing Memory in Convection Schemes Using Idealized Tests" submitted to the Journal of Advances in Modeling Earth Systems (JAMES).},
  author       = {Hwong, Yi-Ling and Colin, Maxime and Aglas, Philipp and Muller, Caroline J and Sherwood, Steven C.},
  publisher    = {Zenodo},
  title        = {{Data-assessing memory in convection schemes using idealized tests}},
  doi          = {10.5281/ZENODO.7757041},
  year         = {2023},
}

@inbook{14992,
  abstract     = {In this chapter we first review the Levy–Lieb functional, which gives the lowest kinetic and interaction energy that can be reached with all possible quantum states having a given density. We discuss two possible convex generalizations of this functional, corresponding to using mixed canonical and grand-canonical states, respectively. We present some recent works about the local density approximation, in which the functionals get replaced by purely local functionals constructed using the uniform electron gas energy per unit volume. We then review the known upper and lower bounds on the Levy–Lieb functionals. We start with the kinetic energy alone, then turn to the classical interaction alone, before we are able to put everything together. A later section is devoted to the Hohenberg–Kohn theorem and the role of many-body unique continuation in its proof.},
  author       = {Lewin, Mathieu and Lieb, Elliott H. and Seiringer, Robert},
  booktitle    = {Density Functional Theory},
  editor       = {Cances, Eric and Friesecke, Gero},
  isbn         = {9783031223396},
  issn         = {3005-0286},
  pages        = {115--182},
  publisher    = {Springer},
  title        = {{Universal Functionals in Density Functional Theory}},
  doi          = {10.1007/978-3-031-22340-2_3},
  year         = {2023},
}

