@inproceedings{14171,
  abstract     = {This paper demonstrates how to recover causal graphs from the score of the
data distribution in non-linear additive (Gaussian) noise models. Using score
matching algorithms as a building block, we show how to design a new generation
of scalable causal discovery methods. To showcase our approach, we also propose
a new efficient method for approximating the score's Jacobian, enabling to
recover the causal graph. Empirically, we find that the new algorithm, called
SCORE, is competitive with state-of-the-art causal discovery methods while
being significantly faster.},
  author       = {Rolland, Paul and Cevher, Volkan and Kleindessner, Matthäus and Russell, Chris and Schölkopf, Bernhard and Janzing, Dominik and Locatello, Francesco},
  booktitle    = {Proceedings of the 39th International Conference on Machine Learning},
  location     = {Baltimore, MD, United States},
  pages        = {18741--18753},
  publisher    = {ML Research Press},
  title        = {{Score matching enables causal discovery of nonlinear additive noise models}},
  volume       = {162},
  year         = {2022},
}

@inproceedings{14172,
  abstract     = {An important component for generalization in machine learning is to uncover underlying latent factors of variation as well as the mechanism through which each factor acts in the world. In this paper, we test whether 17 unsupervised, weakly supervised, and fully supervised representation learning approaches correctly infer the generative factors of variation in simple datasets (dSprites, Shapes3D, MPI3D) from controlled environments, and on our contributed CelebGlow dataset. In contrast to prior robustness work that introduces novel factors of variation during test time, such as blur or other (un)structured noise, we here recompose, interpolate, or extrapolate only existing factors of variation from the training data set (e.g., small and medium-sized objects during training and large objects during testing). Models
that learn the correct mechanism should be able to generalize to this benchmark. In total, we train and test 2000+ models and observe that all of them struggle to learn the underlying mechanism regardless of supervision signal and architectural bias. Moreover, the generalization capabilities of all tested models drop significantly as we move from artificial datasets towards
more realistic real-world datasets. Despite their inability to identify the correct mechanism, the models are quite modular as their ability to infer other in-distribution factors remains fairly stable, providing only a single factor is out-of-distribution. These results point to an important yet understudied problem of learning mechanistic models of observations that can facilitate
generalization.},
  author       = {Schott, Lukas and von Kügelgen, Julius and Träuble, Frederik and Gehler, Peter and Russell, Chris and Bethge, Matthias and Schölkopf, Bernhard and Locatello, Francesco and Brendel, Wieland},
  booktitle    = {10th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{Visual representation learning does not generalize strongly within the same domain}},
  year         = {2022},
}

@inproceedings{14173,
  abstract     = {Since out-of-distribution generalization is a generally ill-posed problem, various proxy targets (e.g., calibration, adversarial robustness, algorithmic corruptions, invariance across shifts) were studied across different research programs resulting in different recommendations. While sharing the same aspirational goal, these approaches have never been tested under the same
experimental conditions on real data. In this paper, we take a unified view of previous work, highlighting message discrepancies that we address empirically, and providing recommendations on how to measure the robustness of a model and how to improve it. To this end, we collect 172 publicly available dataset pairs for training and out-of-distribution evaluation of accuracy, calibration error, adversarial attacks, environment invariance, and synthetic corruptions. We fine-tune over 31k networks, from nine different architectures in the many- and
few-shot setting. Our findings confirm that in- and out-of-distribution accuracies tend to increase jointly, but show that their relation is largely dataset-dependent, and in general more nuanced and more complex than posited by previous, smaller scale studies.},
  author       = {Wenzel, Florian and Dittadi, Andrea and Gehler, Peter Vincent and Simon-Gabriel, Carl-Johann and Horn, Max and Zietlow, Dominik and Kernert, David and Russell, Chris and Brox, Thomas and Schiele, Bernt and Schölkopf, Bernhard and Locatello, Francesco},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  isbn         = {9781713871088},
  location     = {New Orleans, LA, United States},
  pages        = {7181--7198},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Assaying out-of-distribution generalization in transfer learning}},
  volume       = {35},
  year         = {2022},
}

@inproceedings{14174,
  abstract     = {Building sample-efficient agents that generalize out-of-distribution (OOD) in real-world settings remains a fundamental unsolved problem on the path towards achieving higher-level cognition. One particularly promising approach is to begin with low-dimensional, pretrained representations of our world, which should facilitate efficient downstream learning and generalization. By training 240 representations and over 10,000 reinforcement learning (RL) policies on a simulated robotic setup, we evaluate to what extent different properties of
pretrained VAE-based representations affect the OOD generalization of downstream agents. We observe that many agents are surprisingly robust to realistic distribution shifts, including the challenging sim-to-real case. In addition, we find that the generalization performance of a simple downstream proxy task reliably predicts the generalization performance of our RL agents
under a wide range of OOD settings. Such proxy tasks can thus be used to select pretrained representations that will lead to agents that generalize.},
  author       = {Dittadi, Andrea and Träuble, Frederik and Wüthrich, Manuel and Widmaier, Felix and Gehler, Peter and Winther, Ole and Locatello, Francesco and Bachem, Olivier and Schölkopf, Bernhard and Bauer, Stefan},
  booktitle    = {10th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{The role of pretrained representations for the OOD generalization of reinforcement learning agents}},
  year         = {2022},
}

@inproceedings{14175,
  abstract     = {Predicting the future trajectory of a moving agent can be easy when the past trajectory continues smoothly but is challenging when complex interactions with other agents are involved. Recent deep learning approaches for trajectory prediction show promising performance and partially attribute this to successful reasoning about agent-agent interactions. However, it remains unclear which features such black-box models actually learn to use for making predictions. This paper proposes a procedure that quantifies the contributions
of different cues to model performance based on a variant of Shapley values. Applying this procedure to state-of-the-art trajectory prediction methods on standard benchmark datasets shows that they are, in fact, unable to reason about interactions. Instead, the past trajectory of the target is the only feature used for predicting its future. For a task with richer social
interaction patterns, on the other hand, the tested models do pick up such interactions to a certain extent, as quantified by our feature attribution method. We discuss the limits of the proposed method and its links to causality.},
  author       = {Makansi, Osama and von Kügelgen, Julius and Locatello, Francesco and Gehler, Peter and Janzing, Dominik and Brox, Thomas and Schölkopf, Bernhard},
  booktitle    = {10th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{You mostly walk alone: Analyzing feature attribution in trajectory prediction}},
  year         = {2022},
}

@inproceedings{14215,
  author       = {Rahaman, Nasim and Weiss, Martin and Träuble, Frederik and Locatello, Francesco and Lacoste, Alexandre and Bengio, Yoshua and Pal, Chris and Li, Li Erran and Schölkopf, Bernhard},
  title        = {{A general purpose neural architecture for geospatial systems}},
  booktitle    = {36th Conference on Neural Information Processing Systems},
  location     = {New Orleans, LA, United States},
  year         = {2022},
  abstract     = {Geospatial Information Systems are used by researchers and Humanitarian Assistance and Disaster Response (HADR) practitioners to support a wide variety of important applications. However, collaboration between these actors is difficult due to the heterogeneous nature of geospatial data modalities (e.g., multi-spectral images of various resolutions, timeseries, weather data) and diversity of tasks (e.g., regression of human activity indicators or detecting forest fires). In this work, we present a roadmap towards the construction of a general-purpose neural architecture (GPNA) with a geospatial inductive bias, pre-trained on large amounts of unlabelled earth observation data in a self-supervised manner. We envision how such a model may facilitate cooperation between members of the community. We show preliminary results on the first step of the roadmap, where we instantiate an architecture that can process a wide variety of geospatial data modalities and demonstrate that it can achieve competitive performance with domain-specific architectures on tasks relating to the U.N.'s Sustainable Development Goals.},
}

@unpublished{14220,
  abstract     = {Although reinforcement learning has seen remarkable progress over the last years, solving robust dexterous object-manipulation tasks in multi-object settings remains a challenge. In this paper, we focus on models that can learn manipulation tasks in fixed multi-object settings and extrapolate this skill zero-shot without any drop in performance when the number of objects changes. We consider the generic task of bringing a specific cube out of a set to a goal position. We find that previous approaches, which primarily leverage attention and graph neural network-based architectures, do not generalize their skills when the number of input objects changes while scaling as $K^2$. We propose an alternative plug-and-play module based on relational inductive biases to overcome these limitations. Besides exceeding performances in their training environment, we show that our approach, which scales linearly in K, allows agents to extrapolate and generalize zero-shot to any new object number.},
  archiveprefix = {arXiv},
  author       = {Mambelli, Davide and Träuble, Frederik and Bauer, Stefan and Schölkopf, Bernhard and Locatello, Francesco},
  booktitle    = {arXiv},
  eprint       = {2201.13388},
  note         = {Preprint},
  title        = {{Compositional multi-object reinforcement learning with linear relation networks}},
  doi          = {10.48550/arXiv.2201.13388},
  year         = {2022},
}

@article{14117,
  abstract     = {The two fields of machine learning and graphical causality arose and are developed separately. However, there is, now, cross-pollination and increasing interest in both fields to benefit from the advances of the other. In this article, we review fundamental concepts of causal inference and relate them to crucial open problems of machine learning, including transfer and generalization, thereby assaying how causality can contribute to modern machine learning research. This also applies in the opposite direction: we note that most work in causality starts from the premise that the causal variables are given. A central problem for AI and causality is, thus, causal representation learning, that is, the discovery of high-level causal variables from low-level observations. Finally, we delineate some implications of causality for machine learning and propose key research areas at the intersection of both communities.},
  author       = {Schölkopf, Bernhard and Locatello, Francesco and Bauer, Stefan and Ke, Nan Rosemary and Kalchbrenner, Nal and Goyal, Anirudh and Bengio, Yoshua},
  issn         = {1558-2256},
  journal      = {Proceedings of the IEEE},
  keywords     = {Electrical and Electronic Engineering},
  number       = {5},
  pages        = {612--634},
  publisher    = {Institute of Electrical and Electronics Engineers},
  title        = {{Toward causal representation learning}},
  doi          = {10.1109/jproc.2021.3058954},
  volume       = {109},
  year         = {2021},
}

@inproceedings{14176,
  abstract     = {Intensive care units (ICU) are increasingly looking towards machine learning for methods to provide online monitoring of critically ill patients. In machine learning, online monitoring is often formulated as a supervised learning problem. Recently, contrastive learning approaches have demonstrated promising improvements over competitive supervised benchmarks. These methods rely on well-understood data augmentation techniques developed for image data which do not apply to online monitoring. In this work, we overcome this limitation by
supplementing time-series data augmentation techniques with a novel contrastive
learning objective which we call neighborhood contrastive learning (NCL). Our objective explicitly groups together contiguous time segments from each patient while maintaining state-specific information. Our experiments demonstrate a marked improvement over existing work applying contrastive methods to medical time-series.},
  author       = {Yèche, Hugo and Dresdner, Gideon and Locatello, Francesco and Hüser, Matthias and Rätsch, Gunnar},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {11964--11974},
  publisher    = {ML Research Press},
  title        = {{Neighborhood contrastive learning applied to online patient monitoring}},
  volume       = {139},
  year         = {2021},
}

@inproceedings{14177,
  author       = {Träuble, Frederik and Creager, Elliot and Kilbertus, Niki and Locatello, Francesco and Dittadi, Andrea and Goyal, Anirudh and Schölkopf, Bernhard and Bauer, Stefan},
  title        = {{On disentangled representations learned from correlated data}},
  booktitle    = {Proceedings of the 38th International Conference on Machine Learning},
  volume       = {139},
  pages        = {10401--10412},
  publisher    = {ML Research Press},
  location     = {Virtual},
  year         = {2021},
  abstract     = {The focus of disentanglement approaches has been on identifying independent factors of variation in data. However, the causal variables underlying real-world observations are often not statistically independent. In this work, we bridge the gap to real-world scenarios by analyzing the behavior of the most prominent disentanglement approaches on correlated data in a large-scale empirical study (including 4260 models). We show and quantify that systematically induced correlations in the dataset are being learned and reflected in the latent representations, which has implications for downstream applications of disentanglement such as fairness. We also demonstrate how to resolve these latent correlations, either using weak supervision during
training or by post-hoc correcting a pre-trained model with a small number of labels.},
}

@inproceedings{14178,
  author       = {Dittadi, Andrea and Träuble, Frederik and Locatello, Francesco and Wüthrich, Manuel and Agrawal, Vaibhav and Winther, Ole and Bauer, Stefan and Schölkopf, Bernhard},
  title        = {{On the transfer of disentangled representations in realistic settings}},
  booktitle    = {The Ninth International Conference on Learning Representations},
  location     = {Virtual},
  year         = {2021},
  abstract     = {Learning meaningful representations that disentangle the underlying structure of the data generating process is considered to be of key importance in machine learning. While disentangled representations were found to be useful for diverse tasks such as abstract reasoning and fair classification, their scalability and real-world impact remain questionable. We introduce a new high-resolution dataset with 1M simulated images and over 1,800 annotated real-world images of the same setup. In contrast to previous work, this new dataset exhibits correlations, a complex underlying structure, and allows to evaluate transfer to unseen simulated and real-world settings where the encoder i) remains in distribution or ii) is out of distribution. We propose new architectures in order to scale disentangled representation learning to realistic high-resolution settings and conduct a large-scale empirical study of disentangled representations on this dataset. We observe that disentanglement is a good predictor for out-of-distribution (OOD) task performance.},
}

@inproceedings{14179,
  abstract     = {Self-supervised representation learning has shown remarkable success in a number of domains. A common practice is to perform data augmentation via hand-crafted transformations intended to leave the semantics of the data invariant. We seek to understand the empirical success of this approach from a theoretical perspective. We formulate the augmentation process as a latent variable model by postulating a partition of the latent representation into a content component, which is assumed invariant to augmentation, and a style component, which is allowed to change. Unlike prior work on disentanglement and independent component analysis, we allow for both nontrivial statistical and causal dependencies in the latent space. We study the identifiability of the latent representation based on pairs of views of the observations and prove sufficient conditions that allow us to identify the invariant content partition up to an invertible mapping in both generative and discriminative settings. We find numerical simulations with dependent latent variables are consistent with our theory. Lastly, we introduce Causal3DIdent, a dataset of high-dimensional, visually complex images with rich causal dependencies, which we use to study the effect of data augmentations performed in practice.},
  author       = {von Kügelgen, Julius and Sharma, Yash and Gresele, Luigi and Brendel, Wieland and Schölkopf, Bernhard and Besserve, Michel and Locatello, Francesco},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {16451--16467},
  title        = {{Self-supervised learning with data augmentations provably isolates content from style}},
  volume       = {34},
  year         = {2021},
}

@inproceedings{14180,
  author       = {Rahaman, Nasim and Gondal, Muhammad Waleed and Joshi, Shruti and Gehler, Peter and Bengio, Yoshua and Locatello, Francesco and Schölkopf, Bernhard},
  title        = {{Dynamic inference with neural interpreters}},
  booktitle    = {Advances in Neural Information Processing Systems},
  volume       = {34},
  pages        = {10985--10998},
  location     = {Virtual},
  year         = {2021},
  isbn         = {9781713845393},
  abstract     = {Modern neural network architectures can leverage large amounts of data to generalize well within the training distribution. However, they are less capable of systematic generalization to data drawn from unseen but related distributions, a feat that is hypothesized to require compositional reasoning and reuse of knowledge. In this work, we present Neural Interpreters, an architecture that factorizes inference in a self-attention network as a system of modules, which we call \emph{functions}. Inputs to the model are routed through a sequence of functions in a way that is end-to-end learned. The proposed architecture can flexibly compose computation along width and depth, and lends itself well to capacity extension after training. To demonstrate the versatility of Neural Interpreters, we evaluate it in two distinct settings: image classification and visual abstract reasoning on Raven Progressive Matrices. In the former, we show that Neural Interpreters perform on par with the vision transformer using fewer parameters, while being transferrable to a new task in a sample efficient manner. In the latter, we find that Neural Interpreters are competitive with respect to the state-of-the-art in terms of systematic generalization. },
}

@inproceedings{14181,
  author       = {Dresdner, Gideon and Shekhar, Saurav and Pedregosa, Fabian and Locatello, Francesco and Rätsch, Gunnar},
  title        = {{Boosting variational inference with locally adaptive step-sizes}},
  booktitle    = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence},
  pages        = {2337--2343},
  publisher    = {International Joint Conferences on Artificial Intelligence},
  location     = {Montreal, Canada},
  year         = {2021},
  doi          = {10.24963/ijcai.2021/322},
  abstract     = {Variational Inference makes a trade-off between the capacity of the variational family and the tractability of finding an approximate posterior distribution. Instead, Boosting Variational Inference allows practitioners to obtain increasingly good posterior approximations by spending more compute. The main obstacle to widespread adoption of Boosting Variational Inference is the amount of resources necessary to improve over a strong Variational Inference baseline. In our work, we trace this limitation back to the global curvature of the KL-divergence. We characterize how the global curvature impacts time and memory consumption, address the problem with the notion of local curvature, and provide a novel approximate backtracking algorithm for estimating local curvature. We give new theoretical convergence rates for our algorithms and provide experimental validation on synthetic and real-world datasets.},
}

@inproceedings{14182,
  abstract     = {When machine learning systems meet real world applications, accuracy is only
one of several requirements. In this paper, we assay a complementary
perspective originating from the increasing availability of pre-trained and
regularly improving state-of-the-art models. While new improved models develop
at a fast pace, downstream tasks vary more slowly or stay constant. Assume that
we have a large unlabelled data set for which we want to maintain accurate
predictions. Whenever a new and presumably better ML model becomes available,
we encounter two problems: (i) given a limited budget, which data points should
be re-evaluated using the new model?; and (ii) if the new predictions differ
from the current ones, should we update? Problem (i) is about compute cost,
which matters for very large data sets and models. Problem (ii) is about
maintaining consistency of the predictions, which can be highly relevant for
downstream applications; our demand is to avoid negative flips, i.e., changing
correct to incorrect predictions. In this paper, we formalize the Prediction
Update Problem and present an efficient probabilistic approach as answer to the
above questions. In extensive experiments on standard classification benchmark
data sets, we show that our method outperforms alternative strategies along key
metrics for backward-compatible prediction updates.},
  author       = {Träuble, Frederik and von Kügelgen, Julius and Kleindessner, Matthäus and Locatello, Francesco and Schölkopf, Bernhard and Gehler, Peter},
  booktitle    = {35th Conference on Neural Information Processing Systems},
  isbn         = {9781713845393},
  location     = {Virtual},
  pages        = {116--128},
  title        = {{Backward-compatible prediction updates: A probabilistic approach}},
  volume       = {34},
  year         = {2021},
}

@misc{14185,
  author       = {Weissenborn, Dirk and Uszkoreit, Jakob and Unterthiner, Thomas and Mahendran, Aravindh and Locatello, Francesco and Kipf, Thomas and Heigold, Georg and Dosovitskiy, Alexey},
  title        = {{Object-centric learning with slot attention}},
  year         = {2021},
  abstract     = {A method involves receiving a perceptual representation including a plurality of feature vectors, and initializing a plurality of slot vectors represented by a neural network memory unit. Each respective slot vector is configured to represent a corresponding entity in the perceptual representation. The method also involves determining an attention matrix based on a product of the plurality of feature vectors transformed by a key function and the plurality of slot vectors transformed by a query function. Each respective value of a plurality of values along each respective dimension of the attention matrix is normalized with respect to the plurality of values. The method additionally involves determining an update matrix based on the plurality of feature vectors transformed by a value function and the attention matrix, and updating the plurality of slot vectors based on the update matrix by way of the neural network memory unit.},
}

@unpublished{14221,
  abstract     = {The world is structured in countless ways. It may be prudent to enforce corresponding structural properties to a learning algorithm's solution, such as incorporating prior beliefs, natural constraints, or causal structures. Doing so may translate to faster, more accurate, and more flexible models, which may directly relate to real-world impact. In this dissertation, we consider two different research areas that concern structuring a learning algorithm's solution: when the structure is known and when it has to be discovered.},
  archiveprefix = {arXiv},
  author       = {Locatello, Francesco},
  booktitle    = {arXiv},
  eprint       = {2111.13693},
  note         = {Preprint},
  title        = {{Enforcing and discovering structure in machine learning}},
  doi          = {10.48550/arXiv.2111.13693},
  year         = {2021},
}

@inproceedings{14332,
  abstract     = {Learning data representations that are useful for various downstream tasks is a cornerstone of artificial intelligence. While existing methods are typically evaluated on downstream tasks such as classification or generative image quality, we propose to assess representations through their usefulness in downstream control tasks, such as reaching or pushing objects. By training over 10,000 reinforcement learning policies, we extensively evaluate to what extent different representation properties affect out-of-distribution (OOD) generalization. Finally, we demonstrate zero-shot transfer of these policies from simulation to the real world, without any domain randomization or fine-tuning. This paper aims to establish the first systematic characterization of the usefulness of learned representations for real-world OOD downstream tasks.},
  author       = {Träuble, Frederik and Dittadi, Andrea and Wüthrich, Manuel and Widmaier, Felix and Gehler, Peter Vincent and Winther, Ole and Locatello, Francesco and Bachem, Olivier and Schölkopf, Bernhard and Bauer, Stefan},
  booktitle    = {ICML 2021 Workshop on Unsupervised Reinforcement Learning},
  location     = {Virtual},
  title        = {{Representation learning for out-of-distribution generalization in reinforcement learning}},
  year         = {2021},
}

@article{14125,
  abstract     = {Motivation: Recent technological advances have led to an increase in the production and availability of single-cell data. The ability to integrate a set of multi-technology measurements would allow the identification of biologically or clinically meaningful observations through the unification of the perspectives afforded by each technology. In most cases, however, profiling technologies consume the used cells and thus pairwise correspondences between datasets are lost. Due to the sheer size single-cell datasets can acquire, scalable algorithms that are able to universally match single-cell measurements carried out in one cell to its corresponding sibling in another technology are needed.
Results: We propose Single-Cell data Integration via Matching (SCIM), a scalable approach to recover such correspondences in two or more technologies. SCIM assumes that cells share a common (low-dimensional) underlying structure and that the underlying cell distribution is approximately constant across technologies. It constructs a technology-invariant latent space using an autoencoder framework with an adversarial objective. Multi-modal datasets are integrated by pairing cells across technologies using a bipartite matching scheme that operates on the low-dimensional latent representations. We evaluate SCIM on a simulated cellular branching process and show that the cell-to-cell matches derived by SCIM reflect the same pseudotime on the simulated dataset. Moreover, we apply our method to two real-world scenarios, a melanoma tumor sample and a human bone marrow sample, where we pair cells from a scRNA dataset to their sibling cells in a CyTOF dataset achieving 90\% and 78\% cell-matching accuracy for each one of the samples, respectively.},
  author       = {Stark, Stefan G and Ficek, Joanna and Locatello, Francesco and Bonilla, Ximena and Chevrier, Stéphane and Singer, Franziska and Aebersold, Rudolf and Al-Quaddoomi, Faisal S and Albinus, Jonas and Alborelli, Ilaria and Andani, Sonali and Attinger, Per-Olof and Bacac, Marina and Baumhoer, Daniel and Beck-Schimmer, Beatrice and Beerenwinkel, Niko and Beisel, Christian and Bernasconi, Lara and Bertolini, Anne and Bodenmiller, Bernd and Bonilla, Ximena and Casanova, Ruben and Chevrier, Stéphane and Chicherova, Natalia and D'Costa, Maya and Danenberg, Esther and Davidson, Natalie and Drăgan, Monica-Andreea and Dummer, Reinhard and Engler, Stefanie and Erkens, Martin and Eschbach, Katja and Esposito, Cinzia and Fedier, André and Ferreira, Pedro and Ficek, Joanna and Frei, Anja L and Frey, Bruno and Goetze, Sandra and Grob, Linda and Gut, Gabriele and Günther, Detlef and Haberecker, Martina and Haeuptle, Pirmin and Heinzelmann-Schwarz, Viola and Herter, Sylvia and Holtackers, Rene and Huesser, Tamara and Irmisch, Anja and Jacob, Francis and Jacobs, Andrea and Jaeger, Tim M and Jahn, Katharina and James, Alva R and Jermann, Philip M and Kahles, André and Kahraman, Abdullah and Koelzer, Viktor H and Kuebler, Werner and Kuipers, Jack and Kunze, Christian P and Kurzeder, Christian and Lehmann, Kjong-Van and Levesque, Mitchell and Lugert, Sebastian and Maass, Gerd and Manz, Markus and Markolin, Philipp and Mena, Julien and Menzel, Ulrike and Metzler, Julian M and Miglino, Nicola and Milani, Emanuela S and Moch, Holger and Muenst, Simone and Murri, Riccardo and Ng, Charlotte KY and Nicolet, Stefan and Nowak, Marta and Pedrioli, Patrick GA and Pelkmans, Lucas and Piscuoglio, Salvatore and Prummer, Michael and Ritter, Mathilde and Rommel, Christian and Rosano-González, María L and Rätsch, Gunnar and Santacroce, Natascha and {Sarabia del Castillo}, Jacobo and Schlenker, Ramona and Schwalie, Petra C and Schwan, Severin and Schär, Tobias and Senti, Gabriela and Singer, Franziska and Sivapatham, Sujana and Snijder, Berend and Sobottka, Bettina and Sreedharan, Vipin T and Stark, Stefan and Stekhoven, Daniel J and Theocharides, Alexandre PA and Thomas, Tinu M and Tolnay, Markus and Tosevski, Vinko and Toussaint, Nora C and Tuncel, Mustafa A and Tusup, Marina and Van Drogen, Audrey and Vetter, Marcus and Vlajnic, Tatjana and Weber, Sandra and Weber, Walter P and Wegmann, Rebekka and Weller, Michael and Wendt, Fabian and Wey, Norbert and Wicki, Andreas and Wollscheid, Bernd and Yu, Shuqing and Ziegler, Johanna and Zimmermann, Marc and Zoche, Martin and Zuend, Gregor and Rätsch, Gunnar and Lehmann, Kjong-Van},
  issn         = {1367-4811},
  journal      = {Bioinformatics},
  keywords     = {Computational Mathematics, Computational Theory and Mathematics, Computer Science Applications, Molecular Biology, Biochemistry, Statistics and Probability},
  number       = {Supplement\_2},
  pages        = {i919--i927},
  publisher    = {Oxford University Press},
  title        = {{SCIM: Universal single-cell matching with unpaired feature sets}},
  doi          = {10.1093/bioinformatics/btaa843},
  volume       = {36},
  year         = {2020},
}

@inproceedings{14186,
  author       = {Locatello, Francesco and Bauer, Stefan and Lucic, Mario and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard and Bachem, Olivier},
  title        = {{A commentary on the unsupervised learning of disentangled representations}},
  booktitle    = {The 34th AAAI Conference on Artificial Intelligence},
  volume       = {34},
  number       = {9},
  pages        = {13681--13684},
  publisher    = {Association for the Advancement of Artificial Intelligence},
  location     = {New York, NY, United States},
  year         = {2020},
  isbn         = {9781577358350},
  issn         = {2374-3468},
  doi          = {10.1609/aaai.v34i09.7120},
  abstract     = {The goal of the unsupervised learning of disentangled representations is to
separate the independent explanatory factors of variation in the data without
access to supervision. In this paper, we summarize the results of Locatello et
al., 2019, and focus on their implications for practitioners. We discuss the
theoretical result showing that the unsupervised learning of disentangled
representations is fundamentally impossible without inductive biases and the
practical challenges it entails. Finally, we comment on our experimental
findings, highlighting the limitations of state-of-the-art approaches and
directions for future research.},
}