@inproceedings{18977,
  abstract     = {Recent advances in large language model (LLM) pretraining have led to high-quality LLMs with impressive abilities. By compressing such LLMs via quantization to 3-4 bits per parameter, they can fit into memory-limited devices such as laptops and mobile phones, enabling personalized use. Quantizing models to 3-4 bits per parameter can lead to moderate to high accuracy losses, especially for smaller models (1-10B parameters), which are suitable for edge deployment. To address this accuracy issue, we introduce the Sparse-Quantized Representation (SpQR), a new compressed format and quantization technique that enables for the first time \emph{near-lossless} compression of LLMs across model scales while reaching similar compression levels to previous methods. SpQR works by identifying and isolating \emph{outlier weights}, which cause particularly large quantization errors, and storing them in higher precision while compressing all other weights to 3-4 bits, and achieves relative accuracy losses of less than 1\% in perplexity for highly-accurate LLaMA and Falcon LLMs. This makes it possible to run a 33B parameter LLM on a single 24 GB consumer GPU without performance degradation at 15\% speedup, thus making powerful LLMs available to consumers without any downsides. SpQR comes with efficient algorithms for both encoding weights into its format, as well as decoding them efficiently at runtime. Specifically, we provide an efficient GPU inference algorithm for SpQR, which yields faster inference than 16-bit baselines at similar accuracy while enabling memory compression gains of more than 4x.},
  author       = {Dettmers, Tim and Svirschevski, Ruslan A. and Egiazarian, Vage and Kuznedelev, Denis and Frantar, Elias and Ashkboos, Saleh and Borzunov, Alexander and Hoefler, Torsten and Alistarh, Dan-Adrian},
  booktitle    = {12th International Conference on Learning Representations},
  location     = {Vienna, Austria},
  publisher    = {OpenReview},
  title        = {{SpQR: A sparse-quantized representation for near-lossless LLM weight compression}},
  year         = {2024},
}

@inproceedings{18996,
  abstract     = {We consider the linear causal representation learning setting where we observe a linear mixing of d unknown latent factors, which follow a linear structural causal model. Recent work has shown that it is possible to recover the latent factors as well as the underlying structural causal model over them, up to permutation and scaling, provided that we have at least d environments, each of which corresponds to perfect interventions on a single latent node (factor). After this powerful result, a key open problem faced by the community has been to relax these conditions: allow for coarser than perfect single-node interventions, and allow for fewer than d of them, since the number of latent factors d could be very large. In this work, we consider precisely such a setting, where we allow a smaller than d number of environments, and also allow for very coarse interventions that can very coarsely \textit{change the entire causal graph over the latent factors}. On the flip side, we relax what we wish to extract to simply the \textit{list of nodes that have shifted between one or more environments}. We provide a surprising identifiability result that it is indeed possible, under some very mild standard assumptions, to identify the set of shifted nodes. Our identifiability proof moreover is a constructive one: we explicitly provide necessary and sufficient conditions for a node to be a shifted node, and show that we can check these conditions given observed data. Our algorithm lends itself very naturally to the sample setting where instead of just interventional distributions, we are provided datasets of samples from each of these distributions. We corroborate our results on both synthetic experiments as well as an interesting psychometric dataset. The code can be found at https://github.com/TianyuCodings/iLCS.},
  author       = {Chen, Tianyu and Bello, Kevin and Locatello, Francesco and Aragam, Bryon and Ravikumar, Pradeep Kumar},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Identifying general mechanism shifts in linear causal representations}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{18998,
  abstract     = {Word embeddings represent language vocabularies as clouds of d-dimensional points. We investigate how information is conveyed by the general shape of these clouds, instead of representing the semantic meaning of each token. Specifically, we use the notion of persistent homology from topological data analysis (TDA) to measure the distances between language pairs from the shape of their unlabeled embeddings. These distances quantify the degree of non-isometry of the embeddings. To distinguish whether these differences are random training errors or capture real information about the languages, we use the computed distance matrices to construct language phylogenetic trees over 81 Indo-European languages. Careful evaluation shows that our reconstructed trees exhibit strong and statistically-significant similarities to the reference.},
  author       = {Draganov, Ondrej and Skiena, Steven},
  booktitle    = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  location     = {Miami, FL, United States},
  pages        = {12080--12099},
  publisher    = {Association for Computational Linguistics},
  title        = {{The shape of word embeddings: Quantifying non-isometry with topological data analysis}},
  doi          = {10.18653/v1/2024.findings-emnlp.705},
  year         = {2024},
}

@unpublished{18999,
  abstract     = {Exploring the shape of point configurations has been a key driver in the evolution of TDA (short for topological data analysis) since its infancy. This survey illustrates the recent efforts to broaden these ideas to model spatial interactions among multiple configurations, each distinguished by a color. It describes advances in this area and prepares the ground for further exploration by mentioning unresolved questions and promising research avenues while focusing on the overlap with discrete geometry.},
  author       = {Cultrera di Montesano, Sebastiano and Draganov, Ondrej and Edelsbrunner, Herbert and Saghafian, Morteza},
  eprint       = {2406.04102},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {{Chromatic topological data analysis}},
  doi          = {10.48550/arXiv.2406.04102},
  year         = {2024},
}

@inproceedings{19005,
  abstract     = {Causal representation learning promises to extend causal models to hidden causal
variables from raw entangled measurements. However, most progress has focused
on proving identifiability results in different settings, and we are not aware of any
successful real-world application. At the same time, the field of dynamical systems
benefited from deep learning and scaled to countless applications but does not allow
parameter identification. In this paper, we draw a clear connection between the two
and their key assumptions, allowing us to apply identifiable methods developed
in causal representation learning to dynamical systems. At the same time, we can
leverage scalable differentiable solvers developed for differential equations to build
models that are both identifiable and practical. Overall, we learn explicitly controllable models that isolate the trajectory-specific parameters for further downstream
tasks such as out-of-distribution classification or treatment effect estimation. We
experiment with a wind simulator with partially known factors of variation. We
also apply the resulting model to real-world climate data and successfully answer
downstream causal questions in line with existing literature on climate change.
Code is available at https://github.com/CausalLearningAI/crl-dynamical-systems.},
  author       = {Yao, Dingling and Muller, Caroline J and Locatello, Francesco},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Marrying causal representation learning with dynamical systems for science}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19007,
  abstract     = {Learning modular object-centric representations is crucial for systematic generalization. Existing methods show promising object-binding capabilities empirically,
but theoretical identifiability guarantees remain relatively underdeveloped. Understanding when object-centric representations can theoretically be identified is
crucial for scaling slot-based methods to high-dimensional images with correctness
guarantees. To that end, we propose a probabilistic slot-attention algorithm that
imposes an aggregate mixture prior over object-centric slot representations, thereby
providing slot identifiability guarantees without supervision, up to an equivalence
relation. We provide empirical verification of our theoretical identifiability result
using both simple 2-dimensional data and high-resolution imaging datasets.
},
  author       = {Kori, Avinash and Locatello, Francesco and Santhirasekaram, Ainkaran and Toni, Francesca and Glocker, Ben and De Sousa Ribeiro, Fabio},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Identifiable object-centric representation learning via probabilistic slot attention}},
  volume       = {37},
  year         = {2024},
}

@unpublished{19013,
  abstract     = {We study the singularities of the moduli space of degree e maps from smooth genus g curves to an arbitrary smooth hypersurface of low degree. For e large compared to g, we show that these moduli spaces have at worst terminal singularities. Our main approach is to study the jet schemes of these moduli spaces by developing a suitable form of the circle method.},
  author       = {Glas, Jakob and Hase-Liu, Matthew},
  eprint       = {2412.14923},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {{Terminal singularities of the moduli space of curves on low degree hypersurfaces and the circle method}},
  doi          = {10.48550/arXiv.2412.14923},
  year         = {2024},
}

@inproceedings{19028,
  abstract     = {The stochastic nature of modern Monte Carlo (MC) rendering methods inevitably produces noise in rendered images for a practical number of samples per pixel. The problem of denoising these images has been widely studied, with most recent methods relying on data-driven, pretrained neural networks. In contrast, in this paper we propose a statistical approach to the denoising problem, treating each pixel as a random variable and reasoning about its distribution. Considering a pixel of the noisy rendered image, we formulate fast pair-wise statistical tests—based on online estimators—to decide which of the nearby pixels to exclude from the denoising filter. We show that for symmetric pixel weights and normally distributed samples, the classical Welch t-test is optimal in terms of mean squared error. We then show how to extend this result to handle non-normal distributions, using more recent confidence-interval formulations in combination with the Box-Cox transformation. Our results show that our statistical denoising approach matches the performance of state-of-the-art neural image denoising without having to resort to any computation-intensive pretraining. Furthermore, our approach easily generalizes to other quantities besides pixel intensity, which we demonstrate by showing additional applications to Russian roulette path termination and multiple importance sampling.},
  author       = {Sakai, Hiroyuki and Freude, Christian and Auzinger, Thomas and Hahn, David and Wimmer, Michael},
  booktitle    = {Proceedings - SIGGRAPH Asia 2024 Conference Papers},
  isbn         = {9798400711312},
  location     = {Tokyo, Japan},
  publisher    = {Association for Computing Machinery},
  title        = {{A statistical approach to Monte Carlo denoising}},
  doi          = {10.1145/3680528.3687591},
  year         = {2024},
}

@article{19051,
  abstract     = {This paper corrects an error in an earlier work of the author.},
  author       = {Browning, Timothy D.},
  issn         = {1687-0247},
  journal      = {International Mathematics Research Notices},
  number       = {13},
  pages        = {10165--10168},
  publisher    = {Oxford University Press},
  title        = {{The polynomial sieve and equal sums of like polynomials}},
  doi          = {10.1093/imrn/rnae066},
  volume       = {2024},
  year         = {2024},
}

@unpublished{19063,
  abstract     = {Instruction-tuned Large Language Models (LLMs) show impressive results in numerous practical applications, but they lack essential safety features that are common in other areas of computer science, particularly an explicit separation of instructions and data. This makes them vulnerable to manipulations such as indirect prompt injections and generally unsuitable for safety-critical tasks. Surprisingly, there is currently no established definition or benchmark to quantify this phenomenon. In this work, we close this gap by introducing a formal measure for instruction-data separation and an empirical variant that is calculable from a model's outputs. We also present a new dataset, SEP, that allows estimating the measure for real-world models. Our results on various LLMs show that the problem of instruction-data separation is real: all models fail to achieve high separation, and canonical mitigation techniques, such as prompt engineering and fine-tuning, either fail to substantially improve separation or reduce model utility. The source code and SEP dataset are openly accessible at https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.
},
  author       = {Zverev, Egor and Abdelnabi, Sahar and Tabesh, Soroush and Fritz, Mario and Lampert, Christoph},
  eprint       = {2403.06833},
  eprinttype   = {arXiv},
  note         = {Preprint},
  title        = {{Can LLMs separate instructions from data? And what do we even mean by that?}},
  doi          = {10.48550/arXiv.2403.06833},
  year         = {2024},
}

@misc{19307,
  abstract     = {This repository contains the data, scripts, SAM codes and files required to reproduce the results of the manuscript "The Unreasonable Efficiency of Total Rain Evaporation Removal in Triggering Convective Self-Aggregation" submitted to the Geophysical Research Letters (GRL).

Brief description of project: This project aims to examine the impact of rain evaporation removal or reduction in the planetary boundary layer (PBL) on convective self aggregation (CSA). Non-rotating radiative-convective equilibrium (RCE) simulations were conducted with the System for Atmospheric Modeling (SAM) cloud resolving model. Rain evaporation in the lowest 1 km was progressively reduced and the effect on CSA was investigated. The physical processes underlying this type of aggregation (referred to in the manuscript as no-evaporation CSA, or NE-CSA) were analyzed and described. 
The default SAM code base (version 6.10.8) can be downloaded from here: http://rossby.msrc.sunysb.edu/~marat/SAM.html},
  author       = {Hwong, Yi-Ling and Muller, Caroline J},
  publisher    = {Zenodo},
  title        = {{Data - The unreasonable efficiency of total rain evaporation removal in triggering convective self-aggregation}},
  doi          = {10.5281/zenodo.10687169},
  year         = {2024},
}

@article{19408,
  abstract     = {Continual learning is a subfield of machine learning, which aims to allow machine learning models to continuously learn on new data, by accumulating knowledge without forgetting what was learned in the past. In this work, we take a step back, and ask: "Why should one care about continual learning in the first place?". We set the stage by examining recent continual learning papers published at four major machine learning conferences, and show that memory-constrained settings dominate the field. Then, we discuss five open problems in machine learning, and even though they might seem unrelated to continual learning at first sight, we show that continual learning will inevitably be part of their solution. These problems are model editing, personalization and specialization, on-device learning, faster (re-)training and reinforcement learning. Finally, by comparing the desiderata from these unsolved problems and the current assumptions in continual learning, we highlight and discuss four future directions for continual learning research. We hope that this work offers an interesting perspective on the future of continual learning, while displaying its potential value and the paths we have to pursue in order to make it successful. This work is the result of the many discussions the authors had at the Dagstuhl seminar on Deep Continual Learning, in March 2023.},
  author       = {Verwimp, Eli and Aljundi, Rahaf and Ben-David, Shai and Bethge, Matthias and Cossu, Andrea and Gepperth, Alexander and Hayes, Tyler L. and Hüllermeier, Eyke and Kanan, Christopher and Kudithipudi, Dhireesha and Lampert, Christoph and Mundt, Martin and Pascanu, Razvan and Popescu, Adrian and Tolias, Andreas S. and Van De Weijer, Joost and Liu, Bing and Lomonaco, Vincenzo and Tuytelaars, Tinne and Van De Ven, Gido M.},
  issn         = {2835-8856},
  journal      = {Transactions on Machine Learning Research},
  publisher    = {Transactions on Machine Learning Research},
  title        = {{Continual learning: Applications and the road forward}},
  volume       = {2024},
  year         = {2024},
}

@comment{Entry 19446: the author field lists both "Holz, Nathalie E." and "Holz, Nathalie" -- possible duplicate author; verify against the published article.}

@article{19446,
  abstract     = {This Comment explores new approaches to enrich large-scale population data, including incorporating macro-environmental and digital health measures.},
  author       = {Nees, Frauke and Renner, Paul and Holz, Nathalie E. and Polemiti, Elli and Siehl, Sebastian and Hese, Sören and Schepanski, Kerstin and Schumann, Gunter and Walter, Henrik and Heinz, Andreas and Ralser, Markus and Twardziok, Sven and Vaidya, Nilakshi and Bernas, Antoine and Serin, Emin and Jentsch, Marcel and Hitchen, Esther and Kebir, Hedi and Lett, Tristram A. and Roy, Jean Charles and Eils, Roland and Taron, Ulrike Helene and Schütz, Tatjana and Banks, Jamie and Banaschewski, Tobias and Jansone, Karina and Christmann, Nina and Meyer-Lindenberg, Andreas and Tost, Heike and Holz, Nathalie and Schwarz, Emanuel and Stringaris, Argyris and Neidhart, Maja and Seefried, Beke and Aden, Rieke and Andreassen, Ole A. and Westlye, Lars T. and Van Der Meer, Dennis and Fernandez, Sara and Kjelkenes, Rikka and Ask, Helga and Rapp, Michael and Tschorn, Mira and Böttger, Sarah Jane and Marquand, Andre and Novarino, Gaia and Marr, Lena and Slater, Mel and Viapiana, Guillem Feixas and Orosa, Francisco Eiroa and Gallego, Jaime and Pastor, Alvaro and Forstner, Andreas J. and Hoffmann, Per and Nöthen, Markus M. and Claus, Isabelle and Miller, Abigail and Mathey, Carina M. and Heilmann-Heimbach, Stefanie and Sommer, Peter and Patraskaki, Myrto and Wilbertz, Johannes and Schmitt, Karen and Jirsa, Viktor and Petkoski, Spase and Pitel, Séverine and Otten, Lisa and Athanasiadis, Anastasios Polykarpos and Pearmund, Charlie and Spanlang, Bernhard and Alvarez, Elena and Sanchez, Mavi and Giner, Arantxa and Jia, Tianye and Gong, Yanting and Xia, Yunman and Chang, Xiao and Calhoun, Vince and Liu, Jingyu and Schwalber, Ameli and Thompson, Paul and Clinton, Nicholas and Desrivières, Sylvane and Young, Allan H. and Stahl, Bernd and Ogoh, George},
  issn         = {2731-6076},
  journal      = {Nature Mental Health},
  number       = {10},
  pages        = {1124--1127},
  publisher    = {Springer Nature},
  title        = {{Large-scale population data enrichment in mental health research}},
  doi          = {10.1038/s44220-024-00316-z},
  volume       = {2},
  year         = {2024},
}

@article{19470,
  abstract     = {When food is freely available, eating occurs without energy deficit. While agouti-related peptide (AgRP) neurons are likely involved, their activation is thought to require negative energy balance. To investigate this, we implemented long-term, continuous in vivo fiber-photometry recordings in mice. We discovered new forms of AgRP neuron regulation, including fast pre-ingestive decreases in activity and unexpectedly rapid activation by fasting. Furthermore, AgRP neuron activity has a circadian rhythm that peaks concurrent with the daily feeding onset. Importantly, this rhythm persists when nutrition is provided via constant-rate gastric infusions. Hence, it is not secondary to a circadian feeding rhythm. The AgRP neuron rhythm is driven by the circadian clock, the suprachiasmatic nucleus (SCN), as SCN ablation abolishes the circadian rhythm in AgRP neuron activity and feeding. The SCN activates AgRP neurons via excitatory afferents from thyrotrophin-releasing hormone-expressing neurons in the dorsomedial hypothalamus (DMHTrh neurons) to drive daily feeding rhythms.},
  author       = {Douglass, Amelia May Barnett and Kucukdereli, Hakan and Madara, Joseph C. and Wang, Daqing and Wu, Chen and Lowenstein, Elijah D. and Tao, Jenkang and Lowell, Bradford B.},
  issn         = {1550-4131},
  journal      = {Cell Metabolism},
  number       = {3},
  pages        = {708--722.e5},
  publisher    = {Elsevier},
  title        = {{Acute and circadian feedforward regulation of agouti-related peptide hunger neurons}},
  doi          = {10.1016/j.cmet.2024.11.009},
  volume       = {37},
  year         = {2024},
}

@article{19486,
  abstract     = {Consider the family of elliptic curves $E_n: y^2 = x^3 + n^2$, where $n$ varies over positive cubefree integers. There is a rational 3-isogeny $\phi$ from $E_n$ to $\hat{E}_n: y^2 = x^3 - 27n^2$ and a dual isogeny $\hat{\phi}: \hat{E}_n \to E_n$. We show that for almost all $n$, the rank of $\mathrm{Sel}_{\phi}(E_n)$ is 0, and the rank of $\mathrm{Sel}_{\hat{\phi}}(\hat{E}_n)$ is determined by the number of prime factors of $n$ that are congruent to $2 \bmod 3$ and the congruence class of $n \bmod 9$.},
  author       = {Chan, Yik Tung},
  issn         = {1687-0247},
  journal      = {International Mathematics Research Notices},
  number       = {9},
  pages        = {7571--7593},
  publisher    = {Oxford University Press},
  title        = {{The 3-isogeny Selmer groups of the elliptic curves $y^2 = x^3 + n^2$}},
  doi          = {10.1093/imrn/rnad266},
  volume       = {2024},
  year         = {2024},
}

@inproceedings{19510,
  abstract     = {We propose a new variant of the Adam optimizer [Kingma and Ba, 2014] called
MICROADAM that specifically minimizes memory overheads, while maintaining
theoretical convergence guarantees. We achieve this by compressing the gradient
information before it is fed into the optimizer state, thereby reducing its memory
footprint significantly. We control the resulting compression error via a novel
instance of the classical error feedback mechanism from distributed optimization [Seide et al., 2014, Alistarh et al., 2018, Karimireddy et al., 2019] in which
the error correction information is itself compressed to allow for practical memory
gains. We prove that the resulting approach maintains theoretical convergence
guarantees competitive to those of AMSGrad, while providing good practical performance. Specifically, we show that MICROADAM can be implemented efficiently
on GPUs: on both million-scale (BERT) and billion-scale (LLaMA) models, MICROADAM provides practical convergence competitive to that of the uncompressed
Adam baseline, with lower memory usage and similar running time. Our code is
available at https://github.com/IST-DASLab/MicroAdam.},
  author       = {Modoranu, Ionut-Vlad and Safaryan, Mher and Malinovsky, Grigory and Kurtic, Eldar and Robert, Thomas and Richtárik, Peter and Alistarh, Dan-Adrian},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19511,
  abstract     = {We introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end, including all weights, activations, and KV cache in 4 bits. QuaRot rotates LLMs in a way that removes outliers from the hidden state without changing the output, making quantization easier. This computational invariance is applied to the hidden state (residual) of the LLM, as well as to the activations of the feed-forward components, aspects of the attention mechanism, and to the KV cache. The result is a quantized model where all matrix multiplications are performed in 4 bits, without any channels identified for retention in higher precision. Our 4-bit quantized LLAMA2-70B model has losses of at most 0.47 WikiText-2 perplexity and retains 99\% of the zero-shot performance. We also show that QuaRot can provide lossless 6 and 8 bit LLAMA-2 models without any calibration data using round-to-nearest quantization. Code is available at github.com/spcl/QuaRot.},
  author       = {Ashkboos, Saleh and Mohtashami, Amirkeivan and Croci, Maximilian L. and Li, Bo and Cameron, Pashmina and Jaggi, Martin and Alistarh, Dan-Adrian and Hoefler, Torsten and Hensman, James},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{QuaRot: Outlier-free 4-bit inference in rotated LLMs}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19512,
  abstract     = {Differential privacy with gradual expiration models the setting where data items
arrive in a stream and at a given time t the privacy loss guaranteed for a data item
seen at time (t − d) is εg(d), where g is a monotonically non-decreasing function.
We study the fundamental continual (binary) counting problem where each data
item consists of a bit, and the algorithm needs to output at each time step the sum of
all the bits streamed so far. For a stream of length T and privacy without expiration
continual counting is possible with maximum (over all time steps) additive error
O(log²(T)/ε) and the best known lower bound is Ω(log(T)/ε); closing this gap
is a challenging open problem.
We show that the situation is very different for privacy with gradual expiration by
giving upper and lower bounds for a large set of expiration functions g. Specifically,
our algorithm achieves an additive error of O(log(T)/ε) for a large set of privacy
expiration functions. We also give a lower bound that shows that if C is the additive
error of any ε-DP algorithm for this problem, then the product of C and the privacy
expiration function after 2C steps must be Ω(log(T)/ε). Our algorithm matches
this lower bound as its additive error is O(log(T)/ε), even when g(2C) = O(1).
Our empirical evaluation shows that we achieve a slowly growing privacy loss
with significantly smaller empirical privacy loss for large values of d than a natural
baseline algorithm.},
  author       = {Andersson, Joel Daniel and Henzinger, Monika H and Pagh, Rasmus and Steiner, Teresa Anna and Upadhyay, Jalaj},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Continual counting with gradual privacy expiration}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19515,
  abstract     = {Neural models learn data representations that lie on low-dimensional manifolds,
yet modeling the relation between these representational spaces is an ongoing challenge. By integrating spectral geometry principles into neural modeling, we show
that this problem can be better addressed in the functional domain, mitigating complexity, while enhancing interpretability and performances on downstream tasks.
To this end, we introduce a multi-purpose framework to the representation learning
community, which allows to: (i) compare different spaces in an interpretable way
and measure their intrinsic similarity; (ii) find correspondences between them, both
in unsupervised and weakly supervised settings, and (iii) to effectively transfer
representations between distinct spaces. We validate our framework on various
applications, ranging from stitching to retrieval tasks, and on multiple modalities,
demonstrating that Latent Functional Maps can serve as a swiss-army knife for
representation alignment.},
  author       = {Fumero, Marco and Pegoraro, Marco and Maiorca, Valentino and Locatello, Francesco and Rodolà, Emanuele},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Latent functional maps: A spectral framework for representation alignment}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19517,
  abstract     = {In this paper, we present a novel data-free method for merging neural networks in weight space. Differently from most existing works, our method optimizes for the permutations of network neurons globally across all layers. This allows us to enforce cycle consistency of the permutations when merging n ≥ 3 models, allowing circular compositions of permutations to be computed without accumulating error along the path. We qualitatively and quantitatively motivate the need for such a constraint, showing its benefits when merging sets of models in scenarios spanning varying architectures and datasets. We finally show that, when coupled
with activation renormalization, our approach yields the best results in the task.},
  author       = {Crisostomi, Donato and Fumero, Marco and Baieri, Daniele and Bernard, Florian and Rodolà, Emanuele},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{C2M3: Cycle-consistent multi-model merging}},
  volume       = {37},
  year         = {2024},
}

