@inproceedings{18972,
  abstract     = {Deep learning models are known to overfit and memorize spurious features in the training dataset. While numerous empirical studies have aimed at understanding this phenomenon, a rigorous theoretical framework to quantify it is still missing. In this paper, we consider spurious features that are uncorrelated with the learning task, and we provide a precise characterization of how they are memorized via two separate terms: (i) the stability of the model with respect to individual training samples, and (ii) the feature alignment between the spurious pattern and the full sample. While the first term is well established in learning theory and it is connected to the generalization error in classical work, the second one is, to the best of our knowledge, novel. Our key technical result gives a precise characterization of the feature alignment for the two prototypical settings of random features (RF) and neural tangent kernel (NTK) regression. We prove that the memorization of spurious features weakens as the generalization capability increases and, through the analysis of the feature alignment, we unveil the role of the model and of its activation function. Numerical experiments show the predictive power of our theory on standard datasets (MNIST, CIFAR-10).},
  author       = {Bombari, Simone and Mondelli, Marco},
  booktitle    = {41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {4267--4299},
  publisher    = {ML Research Press},
  title        = {{How spurious features are memorized: Precise analysis for random and NTK features}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{18973,
  abstract     = {Understanding the reasons behind the exceptional success of transformers requires a better analysis of why attention layers are suitable for NLP tasks. In particular, such tasks require predictive models to capture contextual meaning which often depends on one or few words, even if the sentence is long. Our work studies this key property, dubbed word sensitivity (WS), in the prototypical setting of random features. We show that attention layers enjoy high WS, namely, there exists a vector in the space of embeddings that largely perturbs the random attention features map. The argument critically exploits the role of the softmax in the attention layer, highlighting its benefit compared to other activations (e.g., ReLU). In contrast, the WS of standard random features is of order $1/\sqrt{n}$, $n$ being the number of words in the textual sample, and thus it decays with the length of the context. We then translate these results on the word sensitivity into generalization bounds: due to their low WS, random features provably cannot learn to distinguish between two sentences that differ only in a single word; in contrast, due to their high WS, random attention features have higher generalization capabilities. We validate our theoretical results with experimental evidence over the BERT-Base word embeddings of the imdb review dataset.},
  author       = {Bombari, Simone and Mondelli, Marco},
  booktitle    = {41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {4300--4328},
  publisher    = {ML Research Press},
  title        = {{Towards understanding the word sensitivity of attention layers: A study via random features}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{18974,
  abstract     = {Reinforcement Learning (RL) from temporal logical specifications is a fundamental problem in sequential decision making. One of the basic and core such specification is the reachability specification that requires a target set to be eventually visited. Despite strong empirical results for RL from such specifications, the theoretical guarantees are bleak, including the impossibility of Probably Approximately Correct (PAC) guarantee for reachability specifications. Given the impossibility result, in this work we consider the problem of RL from reachability specifications along with the information of expected conditional distance (ECD). We present (a) lower bound results which establish the necessity of ECD information for PAC guarantees and (b) an algorithm that establishes PAC-guarantees given the ECD information. To the best of our knowledge, this is the first RL from reachability specifications that does not make any assumptions on the underlying environment to learn policies.},
  author       = {Svoboda, Jakub and Bansal, Suguman and Chatterjee, Krishnendu},
  booktitle    = {41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {47331--47344},
  publisher    = {ML Research Press},
  title        = {{Reinforcement learning from reachability specifications: PAC guarantees with expected conditional distance}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{18975,
  abstract     = {Leveraging second-order information about the loss at the scale of deep networks is one of the main lines of approach for improving the performance of current optimizers for deep learning. Yet, existing approaches for accurate full-matrix preconditioning, such as Full-Matrix Adagrad (GGT) or Matrix-Free Approximate Curvature (M-FAC) suffer from massive storage costs when applied even to small-scale models, as they must store a sliding window of gradients, whose memory requirements are multiplicative in the model dimension. In this paper, we address this issue via a novel and efficient error-feedback technique that can be applied to compress preconditioners by up to two orders of magnitude in practice, without loss of convergence. Specifically, our approach compresses the gradient information via sparsification or low-rank compression before it is fed into the preconditioner, feeding the compression error back into future iterations. Extensive experiments on deep neural networks show that this approach can compress full-matrix preconditioners to up to 99% sparsity without accuracy loss, effectively removing the memory overhead of full-matrix preconditioners such as GGT and M-FAC.},
  author       = {Modoranu, Ionut-Vlad and Kalinov, Aleksei and Kurtic, Eldar and Frantar, Elias and Alistarh, Dan-Adrian},
  booktitle    = {41st International Conference on Machine Learning},
  issn         = {2640-3498},
  location     = {Vienna, Austria},
  pages        = {35910--35933},
  publisher    = {ML Research Press},
  title        = {{Error feedback can accurately compress preconditioners}},
  volume       = {235},
  year         = {2024},
}

@inproceedings{18976,
  abstract     = {We analyze asynchronous-type algorithms for distributed SGD in the heterogeneous setting, where each worker has its own computation and communication speeds, as well as data distribution. In these algorithms, workers compute possibly stale and stochastic gradients associated with their local data at some iteration back in history and then return those gradients to the server without synchronizing with other workers. We present a unified convergence theory for non-convex smooth functions in the heterogeneous regime. The proposed analysis provides convergence for pure asynchronous SGD and its various modifications. Moreover, our theory explains what affects the convergence rate and what can be done to improve the performance of asynchronous algorithms. In particular, we introduce a novel asynchronous method based on worker shuffling. As a by-product of our analysis, we also demonstrate convergence guarantees for gradient-type algorithms such as SGD with random reshuffling and shuffle-once mini-batch SGD. The derived rates match the best-known results for those algorithms, highlighting the tightness of our approach. Finally, our numerical evaluations support theoretical findings and show the good practical performance of our method.},
  author       = {Islamov, Rustem and Safaryan, Mher and Alistarh, Dan-Adrian},
  booktitle    = {Proceedings of The 27th International Conference on Artificial Intelligence and Statistics},
  issn         = {2640-3498},
  location     = {Valencia, Spain},
  pages        = {649--657},
  publisher    = {ML Research Press},
  title        = {{AsGrad: A sharp unified analysis of asynchronous-SGD algorithms}},
  volume       = {238},
  year         = {2024},
}

@inproceedings{18977,
  abstract     = {Recent advances in large language model (LLM) pretraining have led to high-quality LLMs with impressive abilities. By compressing such LLMs via quantization to 3-4 bits per parameter, they can fit into memory-limited devices such as laptops and mobile phones, enabling personalized use. Quantizing models to 3-4 bits per parameter can lead to moderate to high accuracy losses, especially for smaller models (1-10B parameters), which are suitable for edge deployment. To address this accuracy issue, we introduce the Sparse-Quantized Representation (SpQR), a new compressed format and quantization technique that enables for the first time \emph{near-lossless} compression of LLMs across model scales while reaching similar compression levels to previous methods. SpQR works by identifying and isolating \emph{outlier weights}, which cause particularly large quantization errors, and storing them in higher precision while compressing all other weights to 3-4 bits, and achieves relative accuracy losses of less than 1% in perplexity for highly-accurate LLaMA and Falcon LLMs. This makes it possible to run a 33B parameter LLM on a single 24 GB consumer GPU without performance degradation at 15% speedup, thus making powerful LLMs available to consumers without any downsides. SpQR comes with efficient algorithms for both encoding weights into its format, as well as decoding them efficiently at runtime. Specifically, we provide an efficient GPU inference algorithm for SpQR, which yields faster inference than 16-bit baselines at similar accuracy while enabling memory compression gains of more than 4x.},
  author       = {Dettmers, Tim and Svirschevski, Ruslan A. and Egiazarian, Vage and Kuznedelev, Denis and Frantar, Elias and Ashkboos, Saleh and Borzunov, Alexander and Hoefler, Torsten and Alistarh, Dan-Adrian},
  booktitle    = {12th International Conference on Learning Representations},
  location     = {Vienna, Austria},
  publisher    = {OpenReview},
  title        = {{SpQR: A sparse-quantized representation for near-lossless LLM weight compression}},
  year         = {2024},
}

@inproceedings{18996,
  abstract     = {We consider the linear causal representation learning setting where we observe a linear mixing of d unknown latent factors, which follow a linear structural causal model. Recent work has shown that it is possible to recover the latent factors as well as the underlying structural causal model over them, up to permutation and scaling, provided that we have at least d environments, each of which corresponds to perfect interventions on a single latent node (factor). After this powerful result, a key open problem faced by the community has been to relax these conditions: allow for coarser than perfect single-node interventions, and allow for fewer than d of them, since the number of latent factors d could be very large. In this work, we consider precisely such a setting, where we allow a smaller than d number of environments, and also allow for very coarse interventions that can very coarsely \textit{change the entire causal graph over the latent factors}. On the flip side, we relax what we wish to extract to simply the \textit{list of nodes that have shifted between one or more environments}. We provide a surprising identifiability result that it is indeed possible, under some very mild standard assumptions, to identify the set of shifted nodes. Our identifiability proof moreover is a constructive one: we explicitly provide necessary and sufficient conditions for a node to be a shifted node, and show that we can check these conditions given observed data. Our algorithm lends itself very naturally to the sample setting where instead of just interventional distributions, we are provided datasets of samples from each of these distributions. We corroborate our results on both synthetic experiments as well as an interesting psychometric dataset. The code can be found at https://github.com/TianyuCodings/iLCS.},
  author       = {Chen, Tianyu and Bello, Kevin and Locatello, Francesco and Aragam, Bryon and Ravikumar, Pradeep Kumar},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Identifying general mechanism shifts in linear causal representations}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{18998,
  abstract     = {Word embeddings represent language vocabularies as clouds of d-dimensional points. We investigate how information is conveyed by the general shape of these clouds, instead of representing the semantic meaning of each token. Specifically, we use the notion of persistent homology from topological data analysis (TDA) to measure the distances between language pairs from the shape of their unlabeled embeddings. These distances quantify the degree of non-isometry of the embeddings. To distinguish whether these differences are random training errors or capture real information about the languages, we use the computed distance matrices to construct language phylogenetic trees over 81 Indo-European languages. Careful evaluation shows that our reconstructed trees exhibit strong and statistically-significant similarities to the reference.},
  author       = {Draganov, Ondrej and Skiena, Steven},
  booktitle    = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  location     = {Miami, FL, United States},
  pages        = {12080--12099},
  publisher    = {Association for Computational Linguistics},
  title        = {{The shape of word embeddings: Quantifying non-isometry with topological data analysis}},
  doi          = {10.18653/v1/2024.findings-emnlp.705},
  year         = {2024},
}

@unpublished{18999,
  abstract     = {Exploring the shape of point configurations has been a key driver in the evolution of TDA (short for topological data analysis) since its infancy. This survey illustrates the recent efforts to broaden these ideas to model spatial interactions among multiple configurations, each distinguished by a color. It describes advances in this area and prepares the ground for further exploration by mentioning unresolved questions and promising research avenues while focusing on the overlap with discrete geometry.},
  author       = {Cultrera di Montesano, Sebastiano and Draganov, Ondrej and Edelsbrunner, Herbert and Saghafian, Morteza},
  booktitle    = {arXiv},
  title        = {{Chromatic topological data analysis}},
  doi          = {10.48550/ARXIV.2406.04102},
  year         = {2024},
}

@inproceedings{19005,
  abstract     = {Causal representation learning promises to extend causal models to hidden causal
variables from raw entangled measurements. However, most progress has focused
on proving identifiability results in different settings, and we are not aware of any
successful real-world application. At the same time, the field of dynamical systems
benefited from deep learning and scaled to countless applications but does not allow
parameter identification. In this paper, we draw a clear connection between the two
and their key assumptions, allowing us to apply identifiable methods developed
in causal representation learning to dynamical systems. At the same time, we can
leverage scalable differentiable solvers developed for differential equations to build
models that are both identifiable and practical. Overall, we learn explicitly controllable models that isolate the trajectory-specific parameters for further downstream
tasks such as out-of-distribution classification or treatment effect estimation. We
experiment with a wind simulator with partially known factors of variation. We
also apply the resulting model to real-world climate data and successfully answer
downstream causal questions in line with existing literature on climate change.
Code is available at https://github.com/CausalLearningAI/crl-dynamical-systems.},
  author       = {Yao, Dingling and Muller, Caroline J and Locatello, Francesco},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Marrying causal representation learning with dynamical systems for science}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19007,
  abstract     = {Learning modular object-centric representations is crucial for systematic generalization. Existing methods show promising object-binding capabilities empirically,
but theoretical identifiability guarantees remain relatively underdeveloped. Understanding when object-centric representations can theoretically be identified is
crucial for scaling slot-based methods to high-dimensional images with correctness
guarantees. To that end, we propose a probabilistic slot-attention algorithm that
imposes an aggregate mixture prior over object-centric slot representations, thereby
providing slot identifiability guarantees without supervision, up to an equivalence
relation. We provide empirical verification of our theoretical identifiability result
using both simple 2-dimensional data and high-resolution imaging datasets.
},
  author       = {Kori, Avinash and Locatello, Francesco and Santhirasekaram, Ainkaran and Toni, Francesca and Glocker, Ben and De Sousa Ribeiro, Fabio},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Identifiable object-centric representation learning via probabilistic slot attention}},
  volume       = {37},
  year         = {2024},
}

@unpublished{19013,
  abstract     = {We study the singularities of the moduli space of degree e maps from smooth genus g curves to an arbitrary smooth hypersurface of low degree. For e large compared to g, we show that these moduli spaces have at worst terminal singularities. Our main approach is to study the jet schemes of these moduli spaces by developing a suitable form of the circle method.},
  author       = {Glas, Jakob and Hase-Liu, Matthew },
  booktitle    = {arXiv},
  title        = {{Terminal singularities of the moduli space of curves on low degree hypersurfaces and the circle method}},
  doi          = {10.48550/arXiv.2412.14923},
  year         = {2024},
}

@article{19051,
  abstract     = {This paper corrects an error in an earlier work of the author.},
  author       = {Browning, Timothy D},
  issn         = {1687-0247},
  journal      = {International Mathematics Research Notices},
  number       = {13},
  pages        = {10165--10168},
  publisher    = {Oxford University Press},
  title        = {{The polynomial sieve and equal sums of like polynomials}},
  doi          = {10.1093/imrn/rnae066},
  volume       = {2024},
  year         = {2024},
}

@unpublished{19063,
  abstract     = {Instruction-tuned Large Language Models (LLMs) show impressive results in numerous practical applications, but they lack essential safety features that are common in other areas of computer science, particularly an explicit separation of instructions and data. This makes them vulnerable to manipulations such as indirect prompt injections and generally unsuitable for safety-critical tasks. Surprisingly, there is currently no established definition or benchmark to quantify this phenomenon. In this work, we close this gap by introducing a formal measure for instruction-data separation and an empirical variant that is calculable from a model's outputs. We also present a new dataset, SEP, that allows estimating the measure for real-world models. Our results on various LLMs show that the problem of instruction-data separation is real: all models fail to achieve high separation, and canonical mitigation techniques, such as prompt engineering and fine-tuning, either fail to substantially improve separation or reduce model utility. The source code and SEP dataset are openly accessible at https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.
},
  author       = {Zverev, Egor and Abdelnabi, Sahar and Tabesh, Soroush and Fritz, Mario and Lampert, Christoph},
  booktitle    = {arXiv},
  title        = {{Can LLMs separate instructions from data? And what do we even mean by that?}},
  doi          = {10.48550/arXiv.2403.06833},
  year         = {2024},
}

@misc{19307,
  abstract     = {This repository contains the data, scripts, SAM codes and files required to reproduce the results of the manuscript "The Unreasonable Efficiency of Total Rain Evaporation Removal in Triggering Convective Self-Aggregation" submitted to the Geophysical Research Letters (GRL).

Brief description of project: This project aims to examine the impact of rain evaporation removal or reduction in the planetary boundary layer (PBL) on convective self aggregation (CSA). Non-rotating radiative-convective equilibrium (RCE) simulations were conducted with the System for Atmospheric Modeling (SAM) cloud resolving model. Rain evaporation in the lowest 1 km was progressively reduced and the effect on CSA was investigated. The physical processes underlying this type of aggregation (referred to in the manuscript as no-evaporation CSA, or NE-CSA) were analyzed and described. 
The default SAM code base (version 6.10.8) can be downloaded from here: http://rossby.msrc.sunysb.edu/~marat/SAM.html},
  author       = {Hwong, Yi-Ling and Muller, Caroline J},
  publisher    = {Zenodo},
  title        = {{Data - The unreasonable efficiency of total rain evaporation removal in triggering convective self-aggregation}},
  doi          = {10.5281/ZENODO.10687169},
  year         = {2024},
}

@article{19408,
  abstract     = {Continual learning is a subfield of machine learning, which aims to allow machine learning models to continuously learn on new data, by accumulating knowledge without forgetting what was learned in the past. In this work, we take a step back, and ask: "Why should one care about continual learning in the first place?". We set the stage by examining recent continual learning papers published at four major machine learning conferences, and show that memory-constrained settings dominate the field. Then, we discuss five open problems in machine learning, and even though they might seem unrelated to continual learning at first sight, we show that continual learning will inevitably be part of their solution. These problems are model editing, personalization and specialization, on-device learning, faster (re-)training and reinforcement learning. Finally, by comparing the desiderata from these unsolved problems and the current assumptions in continual learning, we highlight and discuss four future directions for continual learning research. We hope that this work offers an interesting perspective on the future of continual learning, while displaying its potential value and the paths we have to pursue in order to make it successful. This work is the result of the many discussions the authors had at the Dagstuhl seminar on Deep Continual Learning, in March 2023.},
  author       = {Verwimp, Eli and Aljundi, Rahaf and Ben-David, Shai and Bethge, Matthias and Cossu, Andrea and Gepperth, Alexander and Hayes, Tyler L. and Hüllermeier, Eyke and Kanan, Christopher and Kudithipudi, Dhireesha and Lampert, Christoph and Mundt, Martin and Pascanu, Razvan and Popescu, Adrian and Tolias, Andreas S. and Van De Weijer, Joost and Liu, Bing and Lomonaco, Vincenzo and Tuytelaars, Tinne and Van De Ven, Gido M.},
  issn         = {2835-8856},
  journal      = {Transactions on Machine Learning Research},
  publisher    = {Transactions on Machine Learning Research},
  title        = {{Continual learning: Applications and the road forward}},
  volume       = {2024},
  year         = {2024},
}

@article{19446,
  abstract     = {This Comment explores new approaches to enrich large-scale population data, including incorporating macro-environmental and digital health measures.},
  author       = {Nees, Frauke and Renner, Paul and Holz, Nathalie E. and Polemiti, Elli and Siehl, Sebastian and Hese, Sören and Schepanski, Kerstin and Schumann, Gunter and Walter, Henrik and Heinz, Andreas and Ralser, Markus and Twardziok, Sven and Vaidya, Nilakshi and Bernas, Antoine and Serin, Emin and Jentsch, Marcel and Hitchen, Esther and Kebir, Hedi and Lett, Tristram A. and Roy, Jean Charles and Eils, Roland and Taron, Ulrike Helene and Schütz, Tatjana and Banks, Jamie and Banaschewski, Tobias and Jansone, Karina and Christmann, Nina and Meyer-Lindenberg, Andreas and Tost, Heike and Holz, Nathalie and Schwarz, Emanuel and Stringaris, Argyris and Neidhart, Maja and Seefried, Beke and Aden, Rieke and Andreassen, Ole A. and Westlye, Lars T. and Van Der Meer, Dennis and Fernandez, Sara and Kjelkenes, Rikka and Ask, Helga and Rapp, Michael and Tschorn, Mira and Böttger, Sarah Jane and Marquand, Andre and Novarino, Gaia and Marr, Lena and Slater, Mel and Viapiana, Guillem Feixas and Orosa, Francisco Eiroa and Gallego, Jaime and Pastor, Alvaro and Forstner, Andreas J. and Hoffmann, Per and Nöthen, Markus M. and Claus, Isabelle and Miller, Abigail and Mathey, Carina M. and Heilmann-Heimbach, Stefanie and Sommer, Peter and Patraskaki, Myrto and Wilbertz, Johannes and Schmitt, Karen and Jirsa, Viktor and Petkoski, Spase and Pitel, Séverine and Otten, Lisa and Athanasiadis, Anastasios Polykarpos and Pearmund, Charlie and Spanlang, Bernhard and Alvarez, Elena and Sanchez, Mavi and Giner, Arantxa and Jia, Tianye and Gong, Yanting and Xia, Yunman and Chang, Xiao and Calhoun, Vince and Liu, Jingyu and Schwalber, Ameli and Thompson, Paul and Clinton, Nicholas and Desrivières, Sylvane and Young, Allan H. and Stahl, Bernd and Ogoh, George},
  issn         = {2731-6076},
  journal      = {Nature Mental Health},
  number       = {10},
  pages        = {1124--1127},
  publisher    = {Springer Nature},
  title        = {{Large-scale population data enrichment in mental health research}},
  doi          = {10.1038/s44220-024-00316-z},
  volume       = {2},
  year         = {2024},
}

@inproceedings{19510,
  abstract     = {We propose a new variant of the Adam optimizer [Kingma and Ba, 2014] called
MICROADAM that specifically minimizes memory overheads, while maintaining
theoretical convergence guarantees. We achieve this by compressing the gradient
information before it is fed into the optimizer state, thereby reducing its memory
footprint significantly. We control the resulting compression error via a novel
instance of the classical error feedback mechanism from distributed optimization [Seide et al., 2014, Alistarh et al., 2018, Karimireddy et al., 2019] in which
the error correction information is itself compressed to allow for practical memory
gains. We prove that the resulting approach maintains theoretical convergence
guarantees competitive to those of AMSGrad, while providing good practical performance. Specifically, we show that MICROADAM can be implemented efficiently
on GPUs: on both million-scale (BERT) and billion-scale (LLaMA) models, MICROADAM provides practical convergence competitive to that of the uncompressed
Adam baseline, with lower memory usage and similar running time. Our code is
available at https://github.com/IST-DASLab/MicroAdam.},
  author       = {Modoranu, Ionut-Vlad and Safaryan, Mher and Malinovsky, Grigory and Kurtic, Eldar and Robert, Thomas and Richtárik, Peter and Alistarh, Dan-Adrian},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{MICROADAM: Accurate adaptive optimization with low space overhead and provable convergence}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19511,
  abstract     = {We introduce QuaRot, a new Quantization scheme based on Rotations, which is able to quantize LLMs end-to-end, including all weights, activations, and KV cache in 4 bits. QuaRot rotates LLMs in a way that removes outliers from the hidden state without changing the output, making quantization easier. This computational invariance is applied to the hidden state (residual) of the LLM, as well as to the activations of the feed-forward components, aspects of the attention mechanism, and to the KV cache. The result is a quantized model where all matrix multiplications are performed in 4 bits, without any channels identified for retention in higher precision. Our 4-bit quantized LLAMA2-70B model has losses of at most 0.47 WikiText-2 perplexity and retains 99% of the zero-shot performance. We also show that QuaRot can provide lossless 6 and 8 bit LLAMA-2 models without any calibration data using round-to-nearest quantization. Code is available at github.com/spcl/QuaRot.},
  author       = {Ashkboos, Saleh and Mohtashami, Amirkeivan and Croci, Maximilian L. and Li, Bo and Cameron, Pashmina and Jaggi, Martin and Alistarh, Dan-Adrian and Hoefler, Torsten and Hensman, James},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{QuaRot: Outlier-free 4-bit inference in rotated LLMs}},
  volume       = {37},
  year         = {2024},
}

@inproceedings{19512,
  abstract     = {Differential privacy with gradual expiration models the setting where data items
arrive in a stream and at a given time t the privacy loss guaranteed for a data item
seen at time (t − d) is εg(d), where g is a monotonically non-decreasing function.
We study the fundamental continual (binary) counting problem where each data
item consists of a bit, and the algorithm needs to output at each time step the sum of
all the bits streamed so far. For a stream of length T and privacy without expiration
continual counting is possible with maximum (over all time steps) additive error
O(log2
(T)/ε) and the best known lower bound is Ω(log(T)/ε); closing this gap
is a challenging open problem.
We show that the situation is very different for privacy with gradual expiration by
giving upper and lower bounds for a large set of expiration functions g. Specifically,
our algorithm achieves an additive error of O(log(T)/ε) for a large set of privacy
expiration functions. We also give a lower bound that shows that if C is the additive
error of any ε-DP algorithm for this problem, then the product of C and the privacy
expiration function after 2C steps must be Ω(log(T)/ε). Our algorithm matches
this lower bound as its additive error is O(log(T)/ε), even when g(2C) = O(1).
Our empirical evaluation shows that we achieve a slowly growing privacy loss
with significantly smaller empirical privacy loss for large values of d than a natural
baseline algorithm.},
  author       = {Andersson, Joel Daniel and Henzinger, Monika H and Pagh, Rasmus and Steiner, Teresa Anna and Upadhyay, Jalaj},
  booktitle    = {38th Conference on Neural Information Processing Systems},
  issn         = {1049-5258},
  location     = {Vancouver, Canada},
  publisher    = {Neural Information Processing Systems Foundation},
  title        = {{Continual counting with gradual privacy expiration}},
  volume       = {37},
  year         = {2024},
}

