@inproceedings{20032,
  abstract     = {We propose Scalable Mechanistic Neural Network (S-MNN), an enhanced neural network framework designed for scientific machine learning applications involving long temporal sequences. By reformulating the original Mechanistic Neural Network (MNN) (Pervez et al., 2024), we reduce the computational time and space complexities from cubic and quadratic with respect to the sequence length, respectively, to linear. This significant improvement enables efficient modeling of long-term dynamics without sacrificing accuracy or interpretability. Extensive experiments demonstrate that S-MNN matches the original MNN in precision while substantially reducing computational resources. Consequently, S-MNN can drop-in replace the original MNN in applications, providing a practical and efficient tool for integrating mechanistic bottlenecks into neural network models of complex dynamical systems. Source code is available at https://github.com/IST-DASLab/ScalableMNN.},
  author       = {Chen, Jiale and Yao, Dingling and Pervez, Adeel A and Alistarh, Dan-Adrian and Locatello, Francesco},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {63716--63737},
  publisher    = {ICLR},
  title        = {{Scalable mechanistic neural networks}},
  year         = {2025},
}

@inproceedings{20033,
  abstract     = {A growing number of machine learning scenarios rely on knowledge distillation where one uses the output of a surrogate model as labels to supervise the training of a target model. In this work, we provide a sharp characterization of this process for ridgeless, high-dimensional regression, under two settings: (i) model shift, where the surrogate model is arbitrary, and (ii) distribution shift, where the surrogate model is the solution of empirical risk minimization with out-of-distribution data. In both cases, we characterize the precise risk of the target model through non-asymptotic bounds in terms of sample size and data distribution under mild conditions. As a consequence, we identify the form of the optimal surrogate model, which reveals the benefits and limitations of discarding weak features in a data-dependent fashion. In the context of weak-to-strong (W2S) generalization, this has the interpretation that (i) W2S training, with the surrogate as the weak model, can provably outperform training with strong labels under the same data budget, but (ii) it is unable to improve the data scaling law. We validate our results on numerical experiments both on ridgeless regression and on neural network architectures.},
  author       = {Emrullah Ildiz, M. and Gozeten, Halil Alperen and Taga, Ege Onur and Mondelli, Marco and Oymak, Samet},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {2967--3006},
  publisher    = {ICLR},
  title        = {{High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws}},
  year         = {2025},
}

@inproceedings{20034,
  abstract     = {We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. We prove the convergence of LDAdam under standard assumptions, and provide empirical evidence that LDAdam allows for efficient fine-tuning and pre-training of language models.},
  author       = {Robert, Thomas and Safaryan, Mher and Modoranu, Ionut-Vlad and Alistarh, Dan-Adrian},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {101877--101913},
  publisher    = {ICLR},
  title        = {{LDAdam: Adaptive optimization from low-dimensional gradient statistics}},
  year         = {2025},
}

@inproceedings{20035,
  abstract     = {Deep neural networks (DNNs) at convergence consistently represent the training data in the last layer via a geometric structure referred to as neural collapse. This empirical evidence has spurred a line of theoretical research aimed at proving the emergence of neural collapse, mostly focusing on the unconstrained features model. Here, the features of the penultimate layer are free variables, which makes the model data-agnostic and puts into question its ability to capture DNN training. Our work addresses the issue, moving away from unconstrained features and studying DNNs that end with at least two linear layers. We first prove generic guarantees on neural collapse that assume (i) low training error and balancedness of linear layers (for within-class variability collapse), and (ii) bounded conditioning of the features before the linear part (for orthogonality of class-means, and their alignment with weight matrices). The balancedness refers to the fact that $W_{\ell+1}^\top W_{\ell+1} \approx W_\ell W_\ell^\top$ for any pair of consecutive weight matrices of the linear part, and the bounded conditioning requires a well-behaved ratio between largest and smallest non-zero singular values of the features. We then show that such assumptions hold for gradient descent training with weight decay: (i) for networks with a wide first layer, we prove low training error and balancedness, and (ii) for solutions that are either nearly optimal or stable under large learning rates, we additionally prove the bounded conditioning. Taken together, our results are the first to show neural collapse in the end-to-end training of DNNs.},
  author       = {Jacot, Arthur and Súkeník, Peter and Wang, Zihan and Mondelli, Marco},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {1905--1931},
  publisher    = {ICLR},
  title        = {{Wide neural networks trained with weight decay provably exhibit neural collapse}},
  year         = {2025},
}

@inproceedings{20036,
  abstract     = {We introduce NeCo: Patch Neighbor Consistency, a novel self-supervised training loss that enforces patch-level nearest neighbor consistency across a student and teacher model. Compared to contrastive approaches that only yield binary learning signals, i.e. "attract" and "repel", this approach benefits from the more fine-grained learning signal of sorting spatially dense features relative to reference patches. Our method leverages differentiable sorting applied on top of pretrained representations, such as DINOv2-registers, to bootstrap the learning signal and further improve upon them. This dense post-pretraining leads to superior performance across various models and datasets, despite requiring only 19 hours on a single GPU. This method generates high-quality dense feature encoders and establishes several new state-of-the-art results, such as +2.3% and +4.2% for non-parametric in-context semantic segmentation on ADE20k and Pascal VOC, +1.6% and +4.8% for linear segmentation evaluations on COCO-Things and -Stuff, and improvements of more than 1.5% in the 3D understanding of multi-view consistency on SPair-71k.},
  author       = {Pariza, Valentinos and Salehi, Mohammadreza and Burghouts, Gertjan and Locatello, Francesco and Asano, Yuki M.},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {72303--72330},
  publisher    = {ICLR},
  title        = {{Near, far: Patch-ordering enhances vision foundation models' scene understanding}},
  year         = {2025},
}

@inproceedings{20037,
  abstract     = {Disentangling polysemantic neurons is at the core of many current approaches to interpretability of large language models. Here we attempt to study how disentanglement can be used to understand performance, particularly under weight sparsity, a leading post-training optimization technique. We suggest a novel measure for estimating neuronal entanglement: the Wasserstein distance of a neuron's output distribution to a Gaussian. Moreover, we show the existence of a small number of highly entangled "Wasserstein Neurons" in each linear layer of an LLM, characterized by their highly non-Gaussian output distributions, their role in mapping similar inputs to dissimilar outputs, and their significant impact on model accuracy. To study these phenomena, we propose a new experimental framework for disentangling polysemantic neurons. Our framework separates each layer's inputs to create a mixture of experts where each neuron's output is computed by a mixture of neurons of lower Wasserstein distance, each better at maintaining accuracy when sparsified without retraining. We provide strong evidence that this is because the mixture of sparse experts is effectively disentangling the input-output relationship of individual neurons, in particular the difficult Wasserstein neurons.},
  author       = {Sawmya, Shashata and Kong, Linghao and Markov, Ilia and Alistarh, Dan-Adrian and Shavit, Nir},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {26244--26274},
  publisher    = {ICLR},
  title        = {{Wasserstein distances, neuronal entanglement, and sparsity}},
  year         = {2025},
}

@inproceedings{20038,
  abstract     = {Pruning eliminates unnecessary parameters in neural networks; it offers a promising solution to the growing computational demands of large language models (LLMs). While many focus on post-training pruning, sparse pre-training--which combines pruning and pre-training into a single phase--provides a simpler alternative. In this work, we present the first systematic exploration of optimal sparse pre-training configurations for LLMs through an examination of 80 unique pruning schedules across different sparsity levels and training durations. We find that initiating pruning at 25% of total training compute and concluding at 75% achieves near-optimal final evaluation loss. These findings provide valuable insights for efficient and effective sparse pre-training of LLMs. Furthermore, we propose a new scaling law that modifies the Chinchilla scaling law to use the average parameter count over pre-training. Through empirical and theoretical validation, we demonstrate that this modified scaling law accurately models evaluation loss for both sparsely and densely pre-trained LLMs, unifying scaling laws across pre-training paradigms. Our findings indicate that while sparse pre-training achieves the same final model quality as dense pre-training for equivalent compute budgets, it provides substantial benefits through reduced model size, enabling significant potential computational savings during inference.},
  author       = {Jin, Tian and Humayun, Ahmed Imtiaz and Evci, Utku and Subramanian, Suvinay and Yazdanbakhsh, Amir and Alistarh, Dan-Adrian and Dziugaite, Gintare Karolina},
  booktitle    = {13th International Conference on Learning Representations},
  isbn         = {9798331320850},
  location     = {Singapore, Singapore},
  pages        = {85165--85181},
  publisher    = {ICLR},
  title        = {{The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws}},
  year         = {2025},
}
