% ICML 2020 (37th ICML, proceedings volume 119): stochastic Frank-Wolfe for
% constrained finite-sum minimization. "El Ghaoui" is a compound surname, so it
% is entered in "Last, First" form with both words before the comma.
@inproceedings{14187,
  abstract     = {We propose a novel Stochastic Frank-Wolfe (a.k.a. conditional gradient)
algorithm for constrained smooth finite-sum minimization with a generalized
linear prediction/structure. This class of problems includes empirical risk
minimization with sparse, low-rank, or other structured constraints. The
proposed method is simple to implement, does not require step-size tuning, and
has a constant per-iteration cost that is independent of the dataset size.
Furthermore, as a byproduct of the method we obtain a stochastic estimator of
the Frank-Wolfe gap that can be used as a stopping criterion. Depending on the
setting, the proposed method matches or improves on the best computational
guarantees for Stochastic Frank-Wolfe algorithms. Benchmarks on several
datasets highlight different regimes in which the proposed method exhibits a
faster empirical convergence than related methods. Finally, we provide an
implementation of all considered methods in an open-source package.},
  author       = {Négiar, Geoffrey and Dresdner, Gideon and Tsai, Alicia and El Ghaoui, Laurent and Locatello, Francesco and Freund, Robert M. and Pedregosa, Fabian},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {7253--7262},
  publisher    = {ML Research Press},
  title        = {{Stochastic Frank-Wolfe for constrained finite-sum minimization}},
  volume       = {119},
  year         = {2020},
}

% ICML 2020 (37th ICML, proceedings volume 119): weakly-supervised
% disentanglement from pairs of observations.
% The page range uses the ASCII double hyphen, not a Unicode dash.
@inproceedings{14188,
  abstract     = {Intelligent agents should be able to learn useful representations by
observing changes in their environment. We model such observations as pairs of
non-i.i.d. images sharing at least one of the underlying factors of variation.
First, we theoretically show that only knowing how many factors have changed,
but not which ones, is sufficient to learn disentangled representations.
Second, we provide practical algorithms that learn disentangled representations
from pairs of images without requiring annotation of groups, individual
factors, or the number of factors that have changed. Third, we perform a
large-scale empirical study and show that such pairs of observations are
sufficient to reliably learn disentangled representations on several benchmark
data sets. Finally, we evaluate our learned representations and find that they
are simultaneously useful on a diverse suite of tasks, including generalization
under covariate shifts, fairness, and abstract reasoning. Overall, our results
demonstrate that weak supervision enables learning of useful disentangled
representations in realistic scenarios.},
  author       = {Locatello, Francesco and Poole, Ben and Rätsch, Gunnar and Schölkopf, Bernhard and Bachem, Olivier and Tschannen, Michael},
  booktitle    = {Proceedings of the 37th International Conference on Machine Learning},
  location     = {Virtual},
  pages        = {6348--6359},
  publisher    = {ML Research Press},
  title        = {{Weakly-supervised disentanglement without compromises}},
  volume       = {119},
  year         = {2020},
}

% Journal of Machine Learning Research, volume 21 (2020): journal-length
% "sober look" study of unsupervised disentanglement learning and its evaluation.
@article{14195,
  abstract     = {The idea behind the unsupervised learning of disentangled
representations is that real-world data is generated by a few explanatory
factors of variation which can be recovered by unsupervised learning
algorithms. In this paper, we provide a sober look at recent progress in the
field and challenge some common assumptions. We first theoretically show that
the unsupervised learning of disentangled representations is fundamentally
impossible without inductive biases on both the models and the data. Then, we
train over 14000 models covering most prominent methods and evaluation metrics
in a reproducible large-scale experimental study on eight data sets. We observe
that while the different methods successfully enforce properties “encouraged”
by the corresponding losses, well-disentangled models seemingly cannot be
identified without supervision. Furthermore, different evaluation metrics do
not always agree on what should be considered “disentangled” and exhibit
systematic differences in the estimation. Finally, increased disentanglement
does not seem to necessarily lead to a decreased sample complexity of learning
for downstream tasks. Our results suggest that future work on disentanglement
learning should be explicit about the role of inductive biases and (implicit)
supervision, investigate concrete benefits of enforcing disentanglement of the
learned representations, and consider a reproducible experimental setup
covering several data sets.},
  author       = {Locatello, Francesco and Bauer, Stefan and Lucic, Mario and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard and Bachem, Olivier},
  journal      = {Journal of Machine Learning Research},
  publisher    = {MIT Press},
  title        = {{A sober look at the unsupervised learning of disentangled representations and their evaluation}},
  volume       = {21},
  year         = {2020},
}

% NeurIPS 2020 (Advances in Neural Information Processing Systems, volume 33):
% the Slot Attention module for object-centric representation learning.
@inproceedings{14326,
  abstract     = {Learning object-centric representations of complex scenes is a promising step towards enabling efficient abstract reasoning from low-level perceptual features. Yet, most deep learning approaches learn distributed representations that do not capture the compositional properties of natural scenes. In this paper, we present the Slot Attention module, an architectural component that interfaces with perceptual representations such as the output of a convolutional neural network and produces a set of task-dependent abstract representations which we call slots. These slots are exchangeable and can bind to any object in the input by specializing through a competitive procedure over multiple rounds of attention. We empirically demonstrate that Slot Attention can extract object-centric representations that enable generalization to unseen compositions when trained on unsupervised object discovery and supervised property prediction tasks.},
  author       = {Locatello, Francesco and Weissenborn, Dirk and Unterthiner, Thomas and Mahendran, Aravindh and Heigold, Georg and Uszkoreit, Jakob and Dosovitskiy, Alexey and Kipf, Thomas},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713829546},
  issn         = {1049-5258},
  location     = {Virtual},
  pages        = {11525--11538},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Object-centric learning with slot attention}},
  volume       = {33},
  year         = {2020},
}

% 8th ICLR (held virtually): disentanglement learning with a small number of
% labels. The year is that of the conference edition (2020) -- verify against
% the proceedings if the preprint date is wanted instead.
@inproceedings{14184,
  abstract     = {Learning disentangled representations is considered a cornerstone problem in
representation learning. Recently, Locatello et al. (2019) demonstrated that
unsupervised disentanglement learning without inductive biases is theoretically
impossible and that existing inductive biases and unsupervised methods do not
allow to consistently learn disentangled representations. However, in many
practical settings, one might have access to a limited amount of supervision,
for example through manual labeling of (some) factors of variation in a few
training examples. In this paper, we investigate the impact of such supervision
on state-of-the-art disentanglement methods and perform a large scale study,
training over 52000 models under well-defined and reproducible experimental
conditions. We observe that a small number of labeled examples (0.01--0.5\% of
the data set), with potentially imprecise and incomplete labels, is sufficient
to perform model selection on state-of-the-art unsupervised models. Further, we
investigate the benefit of incorporating supervision into the training process.
Overall, we empirically validate that with little and imprecise supervision it
is possible to reliably learn disentangled representations.},
  author       = {Locatello, Francesco and Tschannen, Michael and Bauer, Stefan and Rätsch, Gunnar and Schölkopf, Bernhard and Bachem, Olivier},
  booktitle    = {8th International Conference on Learning Representations},
  location     = {Virtual},
  title        = {{Disentangling factors of variation using few labels}},
  year         = {2020},
}

% UAI 2019 (35th Conference on Uncertainty in Artificial Intelligence,
% proceedings volume 115): identifiability of multi-view nonlinear ICA.
@inproceedings{14189,
  abstract     = {We consider the problem of recovering a common latent source with independent
components from multiple views. This applies to settings in which a variable is
measured with multiple experimental modalities, and where the goal is to
synthesize the disparate measurements into a single unified representation. We
consider the case that the observed views are a nonlinear mixing of
component-wise corruptions of the sources. When the views are considered
separately, this reduces to nonlinear Independent Component Analysis (ICA) for
which it is provably impossible to undo the mixing. We present novel
identifiability proofs that this is possible when the multiple views are
considered jointly, showing that the mixing can theoretically be undone using
function approximators such as deep neural networks. In contrast to known
identifiability results for nonlinear ICA, we prove that independent latent
sources with arbitrary mixing can be recovered as long as multiple,
sufficiently different noisy views are available.},
  author       = {Gresele, Luigi and Rubenstein, Paul K. and Mehrjou, Arash and Locatello, Francesco and Schölkopf, Bernhard},
  booktitle    = {Proceedings of the 35th Conference on Uncertainty in Artificial Intelligence},
  location     = {Tel Aviv, Israel},
  pages        = {217--227},
  publisher    = {ML Research Press},
  title        = {{The incomplete Rosetta Stone problem: Identifiability results for multi-view nonlinear ICA}},
  volume       = {115},
  year         = {2019},
}

% NeurIPS 2019 (Advances in Neural Information Processing Systems, volume 32):
% a disentanglement dataset of real and simulated images for studying
% simulation-to-real transfer.
@inproceedings{14190,
  abstract     = {Learning meaningful and compact representations with disentangled semantic aspects is considered to be of key importance in representation learning. Since real-world data is notoriously costly to collect, many recent state-of-the-art disentanglement models have heavily relied on synthetic toy data-sets. In this paper, we propose a novel data-set which consists of over one million images of physical 3D objects with seven factors of variation, such as object color, shape, size and position. In order to be able to control all the factors of variation precisely, we built an experimental platform where the objects are being moved by a robotic arm. In addition, we provide two more datasets which consist of simulations of the experimental setup. These datasets provide for the first time the possibility to systematically investigate how well different disentanglement methods perform on real data in comparison to simulation, and how simulated data can be leveraged to build better representations of the real world. We provide a first experimental study of these questions and our results indicate that learned models transfer poorly, but that model and hyperparameter selection is an effective means of transferring information to the real world.},
  author       = {Gondal, Muhammad Waleed and Wüthrich, Manuel and Miladinović, Đorđe and Locatello, Francesco and Breidt, Martin and Volchkov, Valentin and Akpo, Joel and Bachem, Olivier and Schölkopf, Bernhard and Bauer, Stefan},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  title        = {{On the transfer of inductive bias from simulation to the real world: a new disentanglement dataset}},
  volume       = {32},
  year         = {2019},
}

% NeurIPS 2019 (Advances in Neural Information Processing Systems, volume 32):
% conditional-gradient method for stochastic problems under affine constraints.
% Convergence rates in the abstract are written as LaTeX math.
@inproceedings{14191,
  abstract     = {A broad class of convex optimization problems can be formulated as a semidefinite program (SDP), minimization of a convex function over the positive-semidefinite cone subject to some affine constraints. The majority of classical SDP solvers are designed for the deterministic setting where problem data is readily available. In this setting, generalized conditional gradient methods (aka Frank-Wolfe-type methods) provide scalable solutions by leveraging the so-called linear minimization oracle instead of the projection onto the semidefinite cone. Most problems in machine learning and modern engineering applications, however, contain some degree of stochasticity. In this work, we propose the first conditional-gradient-type method for solving stochastic optimization problems under affine constraints. Our method guarantees $O(k^{-1/3})$ convergence rate in expectation on the objective residual and $O(k^{-5/12})$ on the feasibility gap.},
  author       = {Locatello, Francesco and Yurtsever, Alp and Fercoq, Olivier and Cevher, Volkan},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  pages        = {14291--14301},
  title        = {{Stochastic Frank-Wolfe for composite convex minimization}},
  volume       = {32},
  year         = {2019},
}

% NeurIPS 2019 (Advances in Neural Information Processing Systems, volume 32):
% disentangled representations for abstract visual reasoning.
% The first author's particle "van" is placed before the surname ("von Last,
% First" form) so that it is parsed as part of the family name.
@inproceedings{14193,
  abstract     = {A disentangled representation encodes information about the salient factors
of variation in the data independently. Although it is often argued that this
representational format is useful in learning to solve many real-world
down-stream tasks, there is little empirical evidence that supports this claim.
In this paper, we conduct a large-scale study that investigates whether
disentangled representations are more suitable for abstract reasoning tasks.
Using two new tasks similar to Raven's Progressive Matrices, we evaluate the
usefulness of the representations learned by 360 state-of-the-art unsupervised
disentanglement models. Based on these representations, we train 3600 abstract
reasoning models and observe that disentangled representations do in fact lead
to better down-stream performance. In particular, they enable quicker learning
using fewer samples.},
  author       = {van Steenkiste, Sjoerd and Locatello, Francesco and Schmidhuber, Jürgen and Bachem, Olivier},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  title        = {{Are disentangled representations helpful for abstract visual reasoning?}},
  volume       = {32},
  year         = {2019},
}

% NeurIPS 2019 (Advances in Neural Information Processing Systems, volume 32):
% relation between disentanglement and fairness of downstream predictions.
% The model count in the abstract is a plain number so that the entry does not
% depend on a package-specific number macro.
@inproceedings{14197,
  abstract     = {Recently there has been a significant interest in learning disentangled
representations, as they promise increased interpretability, generalization to
unseen scenarios and faster learning on downstream tasks. In this paper, we
investigate the usefulness of different notions of disentanglement for
improving the fairness of downstream prediction tasks based on representations.
We consider the setting where the goal is to predict a target variable based on
the learned representation of high-dimensional observations (such as images)
that depend on both the target variable and an \emph{unobserved} sensitive
variable. We show that in this setting both the optimal and empirical
predictions can be unfair, even if the target variable and the sensitive
variable are independent. Analyzing the representations of more than
12600 trained state-of-the-art disentangled models, we observe that
several disentanglement scores are consistently correlated with increased
fairness, suggesting that disentanglement may be a useful property to encourage
fairness when sensitive variables are not observed.},
  author       = {Locatello, Francesco and Abbati, Gabriele and Rainforth, Tom and Bauer, Stefan and Schölkopf, Bernhard and Bachem, Olivier},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781713807933},
  location     = {Vancouver, Canada},
  pages        = {14611--14624},
  title        = {{On the fairness of disentangled representations}},
  volume       = {32},
  year         = {2019},
}

% ICML 2019 (36th ICML, proceedings volume 97): large-scale study challenging
% common assumptions in unsupervised disentanglement learning.
@inproceedings{14200,
  abstract     = {The key idea behind the unsupervised learning of disentangled representations is that real-world data is generated by a few explanatory factors of variation which can be recovered by unsupervised learning algorithms. In this paper, we provide a sober look at recent progress in the field and challenge some common assumptions. We first theoretically show that the unsupervised learning of disentangled representations is fundamentally impossible without inductive biases on both the models and the data. Then, we train more than 12000 models covering most prominent methods and evaluation metrics in a reproducible large-scale experimental study on seven different data sets. We observe that while the different methods successfully enforce properties ``encouraged'' by the corresponding losses, well-disentangled models seemingly cannot be identified without supervision. Furthermore, increased disentanglement does not seem to lead to a decreased sample complexity of learning for downstream tasks. Our results suggest that future work on disentanglement learning should be explicit about the role of inductive biases and (implicit) supervision, investigate concrete benefits of enforcing disentanglement of the learned representations, and consider a reproducible experimental setup covering several data sets.},
  author       = {Locatello, Francesco and Bauer, Stefan and Lucic, Mario and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard and Bachem, Olivier},
  booktitle    = {Proceedings of the 36th International Conference on Machine Learning},
  location     = {Long Beach, CA, United States},
  pages        = {4114--4124},
  publisher    = {ML Research Press},
  title        = {{Challenging common assumptions in the unsupervised learning of disentangled representations}},
  volume       = {97},
  year         = {2019},
}

% ICLR paper (New Orleans): SOM-VAE, interpretable discrete representation
% learning on time series. The year and ordinal are those of the New Orleans
% edition of ICLR (7th, 2019) -- verify against the proceedings if the preprint
% date is wanted instead.
@inproceedings{14198,
  abstract     = {High-dimensional time series are common in many domains. Since human
cognition is not optimized to work well in high-dimensional spaces, these areas
could benefit from interpretable low-dimensional representations. However, most
representation learning algorithms for time series data are difficult to
interpret. This is due to non-intuitive mappings from data features to salient
properties of the representation and non-smoothness over time. To address this
problem, we propose a new representation learning framework building on ideas
from interpretable discrete dimensionality reduction and deep generative
modeling. This framework allows us to learn discrete representations of time
series, which give rise to smooth and interpretable embeddings with superior
clustering performance. We introduce a new way to overcome the
non-differentiability in discrete representation learning and present a
gradient-based version of the traditional self-organizing map algorithm that is
more performant than the original. Furthermore, to allow for a probabilistic
interpretation of our method, we integrate a Markov model in the representation
space. This model uncovers the temporal transition structure, improves
clustering performance even further and provides additional explanatory
insights as well as a natural representation of uncertainty. We evaluate our
model in terms of clustering performance and interpretability on static
(Fashion-)MNIST data, a time series of linearly interpolated (Fashion-)MNIST
images, a chaotic Lorenz attractor system with two macro states, as well as on
a challenging real world medical time series application on the eICU data set.
Our learned representations compare favorably with competitor methods and
facilitate downstream tasks on the real world data.},
  author       = {Fortuin, Vincent and Hüser, Matthias and Locatello, Francesco and Strathmann, Heiko and Rätsch, Gunnar},
  booktitle    = {7th International Conference on Learning Representations},
  location     = {New Orleans, LA, United States},
  title        = {{SOM-VAE: Interpretable discrete representation learning on time series}},
  year         = {2019},
}

% AISTATS 2018 (21st AISTATS, proceedings volume 84): convergence analysis of
% boosting variational inference via the Frank-Wolfe algorithm.
% The abstract is kept verbatim, including its original wording.
@inproceedings{14201,
  abstract     = {Variational inference is a popular technique to approximate a possibly
intractable Bayesian posterior with a more tractable one. Recently, boosting
variational inference has been proposed as a new paradigm to approximate the
posterior by a mixture of densities by greedily adding components to the
mixture. However, as is the case with many other variational inference
algorithms, its theoretical properties have not been studied. In the present
work, we study the convergence properties of this approach from a modern
optimization viewpoint by establishing connections to the classic Frank-Wolfe
algorithm. Our analyses yields novel theoretical insights regarding the
sufficient conditions for convergence, explicit rates, and algorithmic
simplifications. Since a lot of focus in previous works for variational
inference has been on tractability, our work is especially important as a much
needed attempt to bridge the gap between probabilistic models and their
corresponding theoretical properties.},
  author       = {Locatello, Francesco and Khanna, Rajiv and Ghosh, Joydeep and Rätsch, Gunnar},
  booktitle    = {Proceedings of the 21st International Conference on Artificial Intelligence and Statistics},
  location     = {Playa Blanca, Lanzarote, Spain},
  pages        = {464--472},
  publisher    = {ML Research Press},
  title        = {{Boosting variational inference: An optimization perspective}},
  volume       = {84},
  year         = {2018},
}

% NeurIPS 2018 (Advances in Neural Information Processing Systems, volume 31):
% black-box boosting variational inference using the residual ELBO.
@inproceedings{14202,
  abstract     = {Approximating a probability density in a tractable manner is a central task in Bayesian statistics. Variational Inference (VI) is a popular technique that achieves tractability by choosing a relatively simple variational family. Borrowing ideas from the classic boosting framework, recent approaches attempt to \emph{boost} VI by replacing the selection of a single density with a greedily constructed mixture of densities. In order to guarantee convergence, previous works impose stringent assumptions that require significant effort for practitioners. Specifically, they require a custom implementation of the greedy step (called the LMO) for every probabilistic model with respect to an unnatural variational family of truncated distributions. Our work fixes these issues with novel theoretical and algorithmic insights. On the theoretical side, we show that boosting VI satisfies a relaxed smoothness assumption which is sufficient for the convergence of the functional Frank-Wolfe (FW) algorithm. Furthermore, we rephrase the LMO problem and propose to maximize the Residual ELBO (RELBO) which replaces the standard ELBO optimization in VI. These theoretical enhancements allow for black box implementation of the boosting subroutine. Finally, we present a stopping criterion drawn from the duality gap in the classic FW analyses and exhaustive experiments to illustrate the usefulness of our theoretical and algorithmic contributions.},
  author       = {Locatello, Francesco and Dresdner, Gideon and Khanna, Rajiv and Valera, Isabel and Rätsch, Gunnar},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781510884472},
  issn         = {1049-5258},
  location     = {Montreal, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Boosting black box variational inference}},
  volume       = {31},
  year         = {2018},
}

% ICML 2018 (35th ICML, proceedings volume 80): conditional gradient framework
% for composite convex minimization, with applications to semidefinite programming.
% The convergence rate in the abstract is written as LaTeX math.
@inproceedings{14203,
  abstract     = {We propose a conditional gradient framework for a composite convex minimization template with broad applications. Our approach combines smoothing and homotopy techniques under the CGM framework, and provably achieves the optimal $O(1/\sqrt{k})$ convergence rate. We demonstrate that the same rate holds if the linear subproblems are solved approximately with additive or multiplicative error. In contrast with the relevant work, we are able to characterize the convergence when the non-smooth term is an indicator function. Specific applications of our framework include the non-smooth minimization, semidefinite programming, and minimization with linear inclusion constraints over a compact domain. Numerical evidence demonstrates the benefits of our framework.},
  author       = {Yurtsever, Alp and Fercoq, Olivier and Locatello, Francesco and Cevher, Volkan},
  booktitle    = {Proceedings of the 35th International Conference on Machine Learning},
  location     = {Stockholm, Sweden},
  pages        = {5727--5736},
  publisher    = {ML Research Press},
  title        = {{A conditional gradient framework for composite convex minimization with applications to semidefinite programming}},
  volume       = {80},
  year         = {2018},
}

% ICML 2018 (35th ICML, proceedings volume 80): unified analysis of matching
% pursuit and coordinate descent. Convergence rates in the abstract are written
% as LaTeX math.
@inproceedings{14204,
  abstract     = {Two popular examples of first-order optimization methods over linear spaces are coordinate descent and matching pursuit algorithms, with their randomized variants. While the former targets the optimization by moving along coordinates, the latter considers a generalized notion of directions. Exploiting the connection between the two algorithms, we present a unified analysis of both, providing affine invariant sublinear $O(1/t)$ rates on smooth objectives and linear convergence on strongly convex objectives. As a byproduct of our affine invariant analysis of matching pursuit, our rates for steepest coordinate descent are the tightest known. Furthermore, we show the first accelerated convergence rate $O(1/t^2)$ for matching pursuit and steepest coordinate descent on convex objectives.},
  author       = {Locatello, Francesco and Raj, Anant and Karimireddy, Sai Praneeth and Rätsch, Gunnar and Schölkopf, Bernhard and Stich, Sebastian U. and Jaggi, Martin},
  booktitle    = {Proceedings of the 35th International Conference on Machine Learning},
  location     = {Stockholm, Sweden},
  pages        = {3198--3207},
  publisher    = {ML Research Press},
  title        = {{On matching pursuit and coordinate descent}},
  volume       = {80},
  year         = {2018},
}

% 6th ICLR (Vancouver, 2018): training mixtures of implicit generative models,
% with a clustering interpretation. Author names carry their umlauts.
@inproceedings{14224,
  abstract     = {Clustering is a cornerstone of unsupervised learning which can be thought as disentangling multiple generative mechanisms underlying the data. In this paper we introduce an algorithmic framework to train mixtures of implicit generative models which we particularize for variational autoencoders. Relying on an additional set of discriminators, we propose a competitive procedure in which the models only need to approximate the portion of the data distribution from which they can produce realistic samples. As a byproduct, each model is simpler to train, and a clustering interpretation arises naturally from the partitioning of the training points among the models. We empirically show that our approach splits the training distribution in a reasonable way and increases the quality of the generated samples.},
  author       = {Locatello, Francesco and Vincent, Damien and Tolstikhin, Ilya and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard},
  booktitle    = {6th International Conference on Learning Representations},
  location     = {Vancouver, Canada},
  title        = {{Clustering meets implicit generative models}},
  year         = {2018},
}

% arXiv preprint (2018): competitive training of mixtures of independent deep
% generative models. The arXiv identifier is carried by the eprint fields and
% the DOI; "note" is the field this entry type requires.
@unpublished{14327,
  abstract     = {A common assumption in causal modeling posits that the data is generated by a
set of independent mechanisms, and algorithms should aim to recover this
structure. Standard unsupervised learning, however, is often concerned with
training a single model to capture the overall distribution or aspects thereof.
Inspired by clustering approaches, we consider mixtures of implicit generative
models that ``disentangle'' the independent generative mechanisms underlying
the data. Relying on an additional set of discriminators, we propose a
competitive training procedure in which the models only need to capture the
portion of the data distribution from which they can produce realistic samples.
As a by-product, each model is simpler and faster to train. We empirically show
that our approach splits the training distribution in a sensible way and
increases the quality of the generated samples.},
  author       = {Locatello, Francesco and Vincent, Damien and Tolstikhin, Ilya and Rätsch, Gunnar and Gelly, Sylvain and Schölkopf, Bernhard},
  doi          = {10.48550/arXiv.1804.11130},
  eprint       = {1804.11130},
  eprinttype   = {arXiv},
  note         = {arXiv preprint},
  title        = {{Competitive training of mixtures of independent deep generative models}},
  year         = {2018},
}

% AISTATS 2017 (20th AISTATS, proceedings volume 54): unified optimization view
% on generalized matching pursuit and Frank-Wolfe.
@inproceedings{14205,
  abstract     = {Two of the most fundamental prototypes of greedy optimization
are the matching pursuit and Frank-Wolfe algorithms. In this paper, we take a
unified view on both classes of methods, leading to the first explicit
convergence rates of matching pursuit methods in an optimization sense, for
general sets of atoms. We derive sublinear (1/t) convergence for both classes
on general smooth objectives, and linear convergence on strongly convex
objectives, as well as a clear correspondence of algorithm variants. Our
presented algorithms and rates are affine invariant, and do not need any
incoherence or sparsity assumptions.},
  author       = {Locatello, Francesco and Khanna, Rajiv and Tschannen, Michael and Jaggi, Martin},
  booktitle    = {Proceedings of the 20th International Conference on Artificial Intelligence and Statistics},
  location     = {Fort Lauderdale, FL, United States},
  pages        = {860--868},
  publisher    = {ML Research Press},
  title        = {{A unified optimization view on generalized matching pursuit and Frank-Wolfe}},
  volume       = {54},
  year         = {2017},
}

% NeurIPS 2017 (Advances in Neural Information Processing Systems): greedy
% algorithms for optimization over convex cones. Convergence rates in the
% abstract are written as LaTeX math.
% NOTE(review): volume 30 follows the series numbering (31 for 2018, 32 for
% 2019) -- confirm against the proceedings.
@inproceedings{14206,
  abstract     = {Greedy optimization methods such as Matching Pursuit (MP) and Frank-Wolfe (FW) algorithms regained popularity in recent years due to their simplicity, effectiveness and theoretical guarantees. MP and FW address optimization over the linear span and the convex hull of a set of atoms, respectively. In this paper, we consider the intermediate case of optimization over the convex cone, parametrized as the conic hull of a generic atom set, leading to the first principled definitions of non-negative MP algorithms for which we give explicit convergence rates and demonstrate excellent empirical performance. In particular, we derive sublinear ($O(1/t)$) convergence on general smooth and convex objectives, and linear convergence ($O(e^{-t})$) on strongly convex objectives, in both cases for general sets of atoms. Furthermore, we establish a clear correspondence of our algorithms to known algorithms from the MP and FW literature. Our novel algorithms and analyses target general atom sets and general objective functions, and hence are directly applicable to a large variety of learning settings.},
  author       = {Locatello, Francesco and Tschannen, Michael and Rätsch, Gunnar and Jaggi, Martin},
  booktitle    = {Advances in Neural Information Processing Systems},
  isbn         = {9781510860964},
  location     = {Long Beach, CA, United States},
  title        = {{Greedy algorithms for cone constrained optimization with convergence guarantees}},
  volume       = {30},
  year         = {2017},
}