[{"year":"2025","publication":"13th International Conference on Learning Representations","language":[{"iso":"eng"}],"page":"63716-63737","file":[{"file_name":"2025_ICLR_Chen.pdf","success":1,"file_size":732745,"checksum":"64cfdb12ae3e4e8ba57b1403e1066776","access_level":"open_access","content_type":"application/pdf","date_created":"2025-07-22T07:58:22Z","date_updated":"2025-07-22T07:58:22Z","creator":"dernst","file_id":"20065","relation":"main_file"}],"date_published":"2025-04-01T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"full_name":"Chen, Jiale","orcid":"0000-0001-5337-5875","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","last_name":"Chen","first_name":"Jiale"},{"full_name":"Yao, Dingling","id":"d3e02e50-48a8-11ee-8f62-c108061797fa","last_name":"Yao","first_name":"Dingling"},{"first_name":"Adeel A","last_name":"Pervez","id":"fca6d90c-d47f-11ee-bc87-93ff51604981","full_name":"Pervez, Adeel A"},{"full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","last_name":"Alistarh","first_name":"Dan-Adrian"},{"id":"26cfd52f-2483-11ee-8040-88983bcc06d4","last_name":"Locatello","orcid":"0000-0002-4850-0683","full_name":"Locatello, Francesco","first_name":"Francesco"}],"external_id":{"arxiv":["2410.06074"]},"status":"public","article_processing_charge":"No","conference":{"start_date":"2025-04-24","location":"Singapore, Singapore","name":"ICLR: International Conference on Learning Representations","end_date":"2025-04-28"},"has_accepted_license":"1","license":"https://creativecommons.org/licenses/by/4.0/","ddc":["000"],"abstract":[{"text":"We propose Scalable Mechanistic Neural Network (S-MNN), an enhanced neural network framework designed for scientific machine learning applications involving long temporal sequences. 
By reformulating the original Mechanistic Neural Network (MNN) (Pervez et al., 2024), we reduce the computational time and space complexities from cubic and quadratic with respect to the sequence length, respectively, to linear. This significant improvement enables efficient modeling of long-term dynamics without sacrificing accuracy or interpretability. Extensive experiments demonstrate that S-MNN matches the original MNN in precision while substantially reducing computational resources. Consequently, S-MNN can drop-in replace the original MNN in applications, providing a practical and efficient tool for integrating mechanistic bottlenecks into neural network models of complex dynamical systems. Source code is available at https://github.com/IST-DASLab/ScalableMNN.","lang":"eng"}],"file_date_updated":"2025-07-22T07:58:22Z","oa":1,"related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/ScalableMNN"}]},"scopus_import":"1","quality_controlled":"1","citation":{"ista":"Chen J, Yao D, Pervez AA, Alistarh D-A, Locatello F. 2025. Scalable mechanistic neural networks. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 63716–63737.","mla":"Chen, Jiale, et al. “Scalable Mechanistic Neural Networks.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 63716–37.","apa":"Chen, J., Yao, D., Pervez, A. A., Alistarh, D.-A., &#38; Locatello, F. (2025). Scalable mechanistic neural networks. In <i>13th International Conference on Learning Representations</i> (pp. 63716–63737). Singapore, Singapore: ICLR.","chicago":"Chen, Jiale, Dingling Yao, Adeel A Pervez, Dan-Adrian Alistarh, and Francesco Locatello. “Scalable Mechanistic Neural Networks.” In <i>13th International Conference on Learning Representations</i>, 63716–37. ICLR, 2025.","ama":"Chen J, Yao D, Pervez AA, Alistarh D-A, Locatello F. Scalable mechanistic neural networks. 
In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:63716-63737.","ieee":"J. Chen, D. Yao, A. A. Pervez, D.-A. Alistarh, and F. Locatello, “Scalable mechanistic neural networks,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 63716–63737.","short":"J. Chen, D. Yao, A.A. Pervez, D.-A. Alistarh, F. Locatello, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 63716–63737."},"date_created":"2025-07-20T22:02:01Z","corr_author":"1","day":"01","publication_status":"published","publisher":"ICLR","type":"conference","oa_version":"Published Version","arxiv":1,"month":"04","OA_type":"diamond","date_updated":"2025-08-04T08:03:11Z","title":"Scalable mechanistic neural networks","tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"department":[{"_id":"DaAl"},{"_id":"FrLo"}],"_id":"20032","OA_place":"publisher","publication_identifier":{"isbn":["9798331320850"]}},{"file_date_updated":"2025-08-04T08:32:38Z","oa":1,"ddc":["000"],"abstract":[{"text":"A growing number of machine learning scenarios rely on knowledge distillation where one uses the output of a surrogate model as labels to supervise the training of a target model. In this work, we provide a sharp characterization of this process for ridgeless, high-dimensional regression, under two settings: (i) model shift, where the surrogate model is arbitrary, and (ii) distribution shift, where the surrogate model is the solution of empirical risk minimization with out-of-distribution data. In both cases, we characterize the precise risk of the target model through non-asymptotic bounds in terms of sample size and data distribution under mild conditions. 
As a consequence, we identify the form of the optimal surrogate model, which reveals the benefits and limitations of discarding weak features in a data-dependent fashion. In the context of weak-to-strong (W2S) generalization, this has the interpretation that (i) W2S training, with the surrogate as the weak model, can provably outperform training with strong labels under the same data budget, but (ii) it is unable to improve the data scaling law. We validate our results on numerical experiments both on ridgeless regression and on neural network architectures.","lang":"eng"}],"has_accepted_license":"1","article_processing_charge":"No","conference":{"start_date":"2025-04-24","end_date":"2025-04-28","name":"ICLR: International Conference on Learning Representations","location":"Singapore, Singapore"},"day":"01","scopus_import":"1","date_created":"2025-07-20T22:02:02Z","citation":{"ieee":"M. Emrullah Ildiz, H. A. Gozeten, E. O. Taga, M. Mondelli, and S. Oymak, “High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 2967–3006.","short":"M. Emrullah Ildiz, H.A. Gozeten, E.O. Taga, M. Mondelli, S. Oymak, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 2967–3006.","ama":"Emrullah Ildiz M, Gozeten HA, Taga EO, Mondelli M, Oymak S. High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:2967-3006.","ista":"Emrullah Ildiz M, Gozeten HA, Taga EO, Mondelli M, Oymak S. 2025. High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 2967–3006.","apa":"Emrullah Ildiz, M., Gozeten, H. A., Taga, E. 
O., Mondelli, M., &#38; Oymak, S. (2025). High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws. In <i>13th International Conference on Learning Representations</i> (pp. 2967–3006). Singapore, Singapore: ICLR.","chicago":"Emrullah Ildiz, M., Halil Alperen Gozeten, Ege Onur Taga, Marco Mondelli, and Samet Oymak. “High-Dimensional Analysis of Knowledge Distillation: Weak-to-Strong Generalization and Scaling Laws.” In <i>13th International Conference on Learning Representations</i>, 2967–3006. ICLR, 2025.","mla":"Emrullah Ildiz, M., et al. “High-Dimensional Analysis of Knowledge Distillation: Weak-to-Strong Generalization and Scaling Laws.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 2967–3006."},"quality_controlled":"1","language":[{"iso":"eng"}],"publication":"13th International Conference on Learning Representations","page":"2967-3006","year":"2025","status":"public","acknowledgement":"M.E.I., H.A.G., E.O.T., S.O. are supported by the NSF grants CCF-2046816, CCF-2403075, the Office of Naval Research grant N000142412289, an OpenAI Agentic AI Systems grant, and gifts by Open Philanthropy and Google Research. M. M. is funded by the European Union (ERC, INF2, project number 101161364). Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency. 
Neither the European Union nor the granting authority can be held responsible for them.","external_id":{"arxiv":["2410.18837"]},"author":[{"last_name":"Emrullah Ildiz","full_name":"Emrullah Ildiz, M.","first_name":"M."},{"last_name":"Gozeten","full_name":"Gozeten, Halil Alperen","first_name":"Halil Alperen"},{"last_name":"Taga","full_name":"Taga, Ege Onur","first_name":"Ege Onur"},{"first_name":"Marco","last_name":"Mondelli","id":"27EB676C-8706-11E9-9510-7717E6697425","full_name":"Mondelli, Marco","orcid":"0000-0002-3242-7020"},{"last_name":"Oymak","full_name":"Oymak, Samet","first_name":"Samet"}],"file":[{"date_updated":"2025-08-04T08:32:38Z","creator":"dernst","file_id":"20112","relation":"main_file","file_name":"2025_ICLR_Ildiz.pdf","success":1,"file_size":528171,"checksum":"5a38b093ebb4ee4eb662ea142621a5ca","content_type":"application/pdf","access_level":"open_access","date_created":"2025-08-04T08:32:38Z"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_published":"2025-04-01T00:00:00Z","tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws","department":[{"_id":"MaMo"}],"date_updated":"2025-08-04T08:33:58Z","publication_identifier":{"isbn":["9798331320850"]},"OA_place":"publisher","_id":"20033","arxiv":1,"oa_version":"Published Version","publisher":"ICLR","type":"conference","publication_status":"published","OA_type":"diamond","project":[{"_id":"911e6d1f-16d5-11f0-9cad-c5c68c6a1cdf","name":"Inference in High Dimensions: Light-speed Algorithms and Information Limits","grant_number":"101161364"}],"month":"04"},{"date_updated":"2025-08-04T08:41:10Z","department":[{"_id":"DaAl"}],"tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 
4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"LDAdam: Adaptive optimization from low-dimensional gradient statistics","_id":"20034","publication_identifier":{"isbn":["9798331320850"]},"OA_place":"publisher","oa_version":"Published Version","publisher":"ICLR","publication_status":"published","type":"conference","arxiv":1,"month":"04","OA_type":"diamond","has_accepted_license":"1","article_processing_charge":"No","conference":{"start_date":"2025-04-24","name":"ICLR: International Conference on Learning Representations","location":"Singapore, Singapore","end_date":"2025-04-28"},"file_date_updated":"2025-08-04T08:39:51Z","oa":1,"abstract":[{"lang":"eng","text":"We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. We prove the convergence of LDAdam under standard assumptions, and provide empirical evidence that LDAdam allows for efficient fine-tuning and pre-training of language models."}],"ddc":["000"],"related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/LDAdam"}]},"corr_author":"1","day":"01","scopus_import":"1","date_created":"2025-07-20T22:02:02Z","quality_controlled":"1","citation":{"short":"T. Robert, M. Safaryan, I.-V. Modoranu, D.-A. Alistarh, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 
101877–101913.","ieee":"T. Robert, M. Safaryan, I.-V. Modoranu, and D.-A. Alistarh, “LDAdam: Adaptive optimization from low-dimensional gradient statistics,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 101877–101913.","ama":"Robert T, Safaryan M, Modoranu I-V, Alistarh D-A. LDAdam: Adaptive optimization from low-dimensional gradient statistics. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:101877-101913.","ista":"Robert T, Safaryan M, Modoranu I-V, Alistarh D-A. 2025. LDAdam: Adaptive optimization from low-dimensional gradient statistics. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 101877–101913.","apa":"Robert, T., Safaryan, M., Modoranu, I.-V., &#38; Alistarh, D.-A. (2025). LDAdam: Adaptive optimization from low-dimensional gradient statistics. In <i>13th International Conference on Learning Representations</i> (pp. 101877–101913). Singapore, Singapore: ICLR.","mla":"Robert, Thomas, et al. “LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 101877–913.","chicago":"Robert, Thomas, Mher Safaryan, Ionut-Vlad Modoranu, and Dan-Adrian Alistarh. “LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics.” In <i>13th International Conference on Learning Representations</i>, 101877–913. 
ICLR, 2025."},"year":"2025","language":[{"iso":"eng"}],"publication":"13th International Conference on Learning Representations","page":"101877-101913","author":[{"full_name":"Robert, Thomas","last_name":"Robert","first_name":"Thomas"},{"full_name":"Safaryan, Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","last_name":"Safaryan","first_name":"Mher"},{"first_name":"Ionut-Vlad","last_name":"Modoranu","id":"449f7a18-f128-11eb-9611-9b430c0c6333","full_name":"Modoranu, Ionut-Vlad"},{"first_name":"Dan-Adrian","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"external_id":{"arxiv":["2410.16103"]},"date_published":"2025-04-01T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","file":[{"date_updated":"2025-08-04T08:39:51Z","relation":"main_file","file_id":"20113","creator":"dernst","file_size":1346111,"success":1,"file_name":"2025_ICLR_Robert.pdf","date_created":"2025-08-04T08:39:51Z","content_type":"application/pdf","access_level":"open_access","checksum":"9327d82569358d7bf1c3ec1a9952e721"}],"status":"public"},{"day":"01","corr_author":"1","citation":{"ista":"Jacot A, Súkeník P, Wang Z, Mondelli M. 2025. Wide neural networks trained with weight decay provably exhibit neural collapse. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 1905–1931.","apa":"Jacot, A., Súkeník, P., Wang, Z., &#38; Mondelli, M. (2025). Wide neural networks trained with weight decay provably exhibit neural collapse. In <i>13th International Conference on Learning Representations</i> (pp. 1905–1931). Singapore, Singapore: ICLR.","mla":"Jacot, Arthur, et al. “Wide Neural Networks Trained with Weight Decay Provably Exhibit Neural Collapse.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 1905–31.","chicago":"Jacot, Arthur, Peter Súkeník, Zihan Wang, and Marco Mondelli. 
“Wide Neural Networks Trained with Weight Decay Provably Exhibit Neural Collapse.” In <i>13th International Conference on Learning Representations</i>, 1905–31. ICLR, 2025.","ieee":"A. Jacot, P. Súkeník, Z. Wang, and M. Mondelli, “Wide neural networks trained with weight decay provably exhibit neural collapse,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 1905–1931.","short":"A. Jacot, P. Súkeník, Z. Wang, M. Mondelli, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 1905–1931.","ama":"Jacot A, Súkeník P, Wang Z, Mondelli M. Wide neural networks trained with weight decay provably exhibit neural collapse. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:1905-1931."},"quality_controlled":"1","date_created":"2025-07-20T22:02:02Z","scopus_import":"1","oa":1,"file_date_updated":"2025-08-04T08:45:43Z","ddc":["000"],"abstract":[{"lang":"eng","text":"Deep neural networks (DNNs) at convergence consistently represent the training data in the last layer via a geometric structure referred to as neural collapse. This empirical evidence has spurred a line of theoretical research aimed at proving the emergence of neural collapse, mostly focusing on the unconstrained features model. Here, the features of the penultimate layer are free variables, which makes the model data-agnostic and puts into question its ability to capture DNN training. Our work addresses the issue, moving away from unconstrained features and studying DNNs that end with at least two linear layers. We first prove generic guarantees on neural collapse that assume (i) low training error and balancedness of linear layers (for within-class variability collapse), and (ii) bounded conditioning of the features before the linear part (for orthogonality of class-means, and their alignment with weight matrices). 
The balancedness refers to the fact that $W_{\\ell+1}^\\top W_{\\ell+1} \\approx W_\\ell W_\\ell^\\top$ for any pair of consecutive weight matrices of the linear part, and the bounded conditioning requires a well-behaved ratio between largest and smallest non-zero singular values of the features. We then show that such assumptions hold for gradient descent training with weight decay: (i) for networks with a wide first layer, we prove low training error and balancedness, and (ii) for solutions that are either nearly optimal or stable under large learning rates, we additionally prove the bounded conditioning. Taken together, our results are the first to show neural collapse in the end-to-end training of DNNs."}],"has_accepted_license":"1","conference":{"name":"ICLR: International Conference on Learning Representations","location":"Singapore, Singapore","end_date":"2025-04-28","start_date":"2025-04-24"},"article_processing_charge":"No","status":"public","acknowledgement":"M. M. and P. S. are funded by the European Union (ERC, INF2, project number 101161364). Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or the European Research Council Executive Agency. 
Neither the European Union nor the granting authority can be held responsible for them.","author":[{"last_name":"Jacot","full_name":"Jacot, Arthur","first_name":"Arthur"},{"first_name":"Peter","last_name":"Súkeník","id":"d64d6a8d-eb8e-11eb-b029-96fd216dec3c","full_name":"Súkeník, Peter"},{"full_name":"Wang, Zihan","last_name":"Wang","first_name":"Zihan"},{"first_name":"Marco","id":"27EB676C-8706-11E9-9510-7717E6697425","last_name":"Mondelli","orcid":"0000-0002-3242-7020","full_name":"Mondelli, Marco"}],"external_id":{"arxiv":["2410.04887"]},"file":[{"file_name":"2025_ICLR_Jacot.pdf","success":1,"file_size":1337236,"checksum":"59c48c173887139647cc9839c0801136","content_type":"application/pdf","access_level":"open_access","date_created":"2025-08-04T08:45:43Z","date_updated":"2025-08-04T08:45:43Z","creator":"dernst","file_id":"20114","relation":"main_file"}],"date_published":"2025-04-01T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","page":"1905-1931","publication":"13th International Conference on Learning Representations","language":[{"iso":"eng"}],"year":"2025","publication_identifier":{"isbn":["9798331320850"]},"OA_place":"publisher","_id":"20035","title":"Wide neural networks trained with weight decay provably exhibit neural collapse","tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"department":[{"_id":"MaMo"}],"date_updated":"2025-08-04T08:47:00Z","OA_type":"diamond","month":"04","project":[{"_id":"911e6d1f-16d5-11f0-9cad-c5c68c6a1cdf","grant_number":"101161364","name":"Inference in High Dimensions: Light-speed Algorithms and Information Limits"}],"arxiv":1,"oa_version":"Published Version","type":"conference","publisher":"ICLR","publication_status":"published"},{"publisher":"ICLR","type":"conference","publication_status":"published","oa_version":"Published 
Version","arxiv":1,"month":"04","OA_type":"diamond","date_updated":"2025-08-04T08:10:55Z","tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"title":"Near, far: Patch-ordering enhances vision foundation models' scene understanding","department":[{"_id":"FrLo"}],"_id":"20036","OA_place":"publisher","publication_identifier":{"isbn":["9798331320850"]},"year":"2025","page":"72303-72330","language":[{"iso":"eng"}],"publication":"13th International Conference on Learning Representations","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","file":[{"success":1,"file_name":"2025_ICLR_Pariza.pdf","file_size":37788223,"checksum":"ddbe981f3ad3f6cb6daf12c954822eb8","access_level":"open_access","content_type":"application/pdf","date_created":"2025-08-04T08:09:43Z","date_updated":"2025-08-04T08:09:43Z","creator":"dernst","file_id":"20109","relation":"main_file"}],"date_published":"2025-04-01T00:00:00Z","author":[{"first_name":"Valentinos","full_name":"Pariza, Valentinos","last_name":"Pariza"},{"first_name":"Mohammadreza","full_name":"Salehi, Mohammadreza","last_name":"Salehi"},{"last_name":"Burghouts","full_name":"Burghouts, Gertjan","first_name":"Gertjan"},{"orcid":"0000-0002-4850-0683","full_name":"Locatello, Francesco","id":"26cfd52f-2483-11ee-8040-88983bcc06d4","last_name":"Locatello","first_name":"Francesco"},{"full_name":"Asano, Yuki M.","last_name":"Asano","first_name":"Yuki M."}],"external_id":{"arxiv":["2408.11054"]},"status":"public","conference":{"location":"Singapore, Singapore","name":"ICLR: International Conference on Learning Representations","end_date":"2025-04-28","start_date":"2025-04-24"},"article_processing_charge":"No","has_accepted_license":"1","ddc":["000"],"abstract":[{"lang":"eng","text":"We introduce NeCo: Patch Neighbor Consistency, a novel self-supervised training loss that enforces patch-level 
nearest neighbor consistency across a student and teacher model. Compared to contrastive approaches that only yield binary learning signals, i.e. \"attract\" and \"repel\", this approach benefits from the more fine-grained learning signal of sorting spatially dense features relative to reference patches. Our method leverages differentiable sorting applied on top of pretrained representations, such as DINOv2-registers, to bootstrap the learning signal and further improve upon them. This dense post-pretraining leads to superior performance across various models and datasets, despite requiring only 19 hours on a single GPU. This method generates high-quality dense feature encoders and establishes several new state-of-the-art results such as +2.3% and +4.2% for non-parametric in-context semantic segmentation on ADE20k and Pascal VOC, +1.6% and +4.8% for linear segmentation evaluations on COCO-Things and -Stuff and improvements in the 3D understanding of multi-view consistency on SPair-71k, by more than 1.5%."}],"oa":1,"file_date_updated":"2025-08-04T08:09:43Z","citation":{"ama":"Pariza V, Salehi M, Burghouts G, Locatello F, Asano YM. Near, far: Patch-ordering enhances vision foundation models’ scene understanding. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:72303-72330.","ieee":"V. Pariza, M. Salehi, G. Burghouts, F. Locatello, and Y. M. Asano, “Near, far: Patch-ordering enhances vision foundation models’ scene understanding,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 72303–72330.","short":"V. Pariza, M. Salehi, G. Burghouts, F. Locatello, Y.M. Asano, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 72303–72330.","apa":"Pariza, V., Salehi, M., Burghouts, G., Locatello, F., &#38; Asano, Y. M. (2025). Near, far: Patch-ordering enhances vision foundation models’ scene understanding. 
In <i>13th International Conference on Learning Representations</i> (pp. 72303–72330). Singapore, Singapore: ICLR.","mla":"Pariza, Valentinos, et al. “Near, Far: Patch-Ordering Enhances Vision Foundation Models’ Scene Understanding.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 72303–30.","chicago":"Pariza, Valentinos, Mohammadreza Salehi, Gertjan Burghouts, Francesco Locatello, and Yuki M. Asano. “Near, Far: Patch-Ordering Enhances Vision Foundation Models’ Scene Understanding.” In <i>13th International Conference on Learning Representations</i>, 72303–30. ICLR, 2025.","ista":"Pariza V, Salehi M, Burghouts G, Locatello F, Asano YM. 2025. Near, far: Patch-ordering enhances vision foundation models’ scene understanding. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 72303–72330."},"date_created":"2025-07-20T22:02:03Z","quality_controlled":"1","scopus_import":"1","day":"01"},{"scopus_import":"1","date_created":"2025-07-20T22:02:03Z","quality_controlled":"1","citation":{"ista":"Sawmya S, Kong L, Markov I, Alistarh D-A, Shavit N. 2025. Wasserstein distances, neuronal entanglement, and sparsity. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 26244–26274.","mla":"Sawmya, Shashata, et al. “Wasserstein Distances, Neuronal Entanglement, and Sparsity.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 26244–74.","apa":"Sawmya, S., Kong, L., Markov, I., Alistarh, D.-A., &#38; Shavit, N. (2025). Wasserstein distances, neuronal entanglement, and sparsity. In <i>13th International Conference on Learning Representations</i> (pp. 26244–26274). Singapore, Singapore: ICLR.","chicago":"Sawmya, Shashata, Linghao Kong, Ilia Markov, Dan-Adrian Alistarh, and Nir Shavit. 
“Wasserstein Distances, Neuronal Entanglement, and Sparsity.” In <i>13th International Conference on Learning Representations</i>, 26244–74. ICLR, 2025.","ieee":"S. Sawmya, L. Kong, I. Markov, D.-A. Alistarh, and N. Shavit, “Wasserstein distances, neuronal entanglement, and sparsity,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 26244–26274.","short":"S. Sawmya, L. Kong, I. Markov, D.-A. Alistarh, N. Shavit, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 26244–26274.","ama":"Sawmya S, Kong L, Markov I, Alistarh D-A, Shavit N. Wasserstein distances, neuronal entanglement, and sparsity. In: <i>13th International Conference on Learning Representations</i>. ICLR; 2025:26244-26274."},"day":"01","corr_author":"1","related_material":{"link":[{"relation":"software","url":"https://github.com/Shavit-Lab/Sparse-Expansion"}]},"abstract":[{"text":"Disentangling polysemantic neurons is at the core of many current approaches to interpretability of large language models. Here we attempt to study how disentanglement can be used to understand performance, particularly under weight sparsity, a leading post-training optimization technique. We suggest a novel measure for estimating neuronal entanglement: the Wasserstein distance of a neuron's output distribution to a Gaussian. Moreover, we show the existence of a small number of highly entangled \"Wasserstein Neurons\" in each linear layer of an LLM, characterized by their highly non-Gaussian output distributions, their role in mapping similar inputs to dissimilar outputs, and their significant impact on model accuracy. To study these phenomena, we propose a new experimental framework for disentangling polysemantic neurons. 
Our framework separates each layer's inputs to create a mixture of experts where each neuron's output is computed by a mixture of neurons of lower Wasserstein distance, each better at maintaining accuracy when sparsified without retraining. We provide strong evidence that this is because the mixture of sparse experts is effectively disentangling the input-output relationship of individual neurons, in particular the difficult Wasserstein neurons.","lang":"eng"}],"ddc":["000"],"file_date_updated":"2025-08-04T08:14:09Z","oa":1,"article_processing_charge":"No","conference":{"start_date":"2025-04-24","location":"Singapore, Singapore","name":"ICLR: International Conference on Learning Representations","end_date":"2025-04-28"},"has_accepted_license":"1","status":"public","acknowledgement":"The authors would like to extend their gratitude to Lori Leu for her insightful comments on the application of the Wasserstein distance metric. We also wish to thank Elias Frantar for his help in working with the SparseGPT implementation and his advice for the project. 
Additionally, we would like to thank Tony Tong Wang and Thomas Athey for their valuable feedback and constructive discussions.\r\nThis work was supported by an NIH Brains CONNECTS U01 grant and AMD’s AI & HPC Fund.","date_published":"2025-04-01T00:00:00Z","file":[{"success":1,"file_name":"2025_ICLR_Sawmya.pdf","file_size":5447177,"checksum":"39a8fa7dbdd7029859e156f53f20f6bc","date_created":"2025-08-04T08:14:09Z","access_level":"open_access","content_type":"application/pdf","date_updated":"2025-08-04T08:14:09Z","file_id":"20110","creator":"dernst","relation":"main_file"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","external_id":{"arxiv":["2405.15756"]},"author":[{"first_name":"Shashata","last_name":"Sawmya","full_name":"Sawmya, Shashata"},{"last_name":"Kong","full_name":"Kong, Linghao","first_name":"Linghao"},{"id":"D0CF4148-C985-11E9-8066-0BDEE5697425","last_name":"Markov","full_name":"Markov, Ilia","first_name":"Ilia"},{"first_name":"Dan-Adrian","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian"},{"last_name":"Shavit","full_name":"Shavit, Nir","first_name":"Nir"}],"language":[{"iso":"eng"}],"publication":"13th International Conference on Learning Representations","page":"26244-26274","year":"2025","OA_place":"publisher","publication_identifier":{"isbn":["9798331320850"]},"_id":"20037","tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"department":[{"_id":"DaAl"}],"title":"Wasserstein distances, neuronal entanglement, and sparsity","date_updated":"2025-08-04T08:16:43Z","OA_type":"diamond","month":"04","arxiv":1,"publisher":"ICLR","publication_status":"published","type":"conference","oa_version":"Published Version"},{"oa":1,"file_date_updated":"2025-08-04T08:23:47Z","ddc":["000"],"abstract":[{"text":"Pruning eliminates 
unnecessary parameters in neural networks; it offers a promising solution to the growing computational demands of large language models (LLMs). While many focus on post-training pruning, sparse pre-training--which combines pruning and pre-training into a single phase--provides a simpler alternative. In this work, we present the first systematic exploration of optimal sparse pre-training configurations for LLMs through an examination of 80 unique pruning schedules across different sparsity levels and training durations. We find that initiating pruning at 25% of total training compute and concluding at 75% achieves near-optimal final evaluation loss. These findings provide valuable insights for efficient and effective sparse pre-training of LLMs. Furthermore, we propose a new scaling law that modifies the Chinchilla scaling law to use the average parameter count over pre-training. Through empirical and theoretical validation, we demonstrate that this modified scaling law accurately models evaluation loss for both sparsely and densely pre-trained LLMs, unifying scaling laws across pre-training paradigms. Our findings indicate that while sparse pre-training achieves the same final model quality as dense pre-training for equivalent compute budgets, it provides substantial benefits through reduced model size, enabling significant potential computational savings during inference.","lang":"eng"}],"has_accepted_license":"1","conference":{"start_date":"2025-04-24","end_date":"2025-04-28","location":"Singapore, Singapore","name":"ICLR: International Conference on Learning Representations"},"article_processing_charge":"No","day":"01","date_created":"2025-07-20T22:02:03Z","citation":{"chicago":"Jin, Tian, Ahmed Imtiaz Humayun, Utku Evci, Suvinay Subramanian, Amir Yazdanbakhsh, Dan-Adrian Alistarh, and Gintare Karolina Dziugaite. 
“The Journey Matters: Average Parameter Count over Pre-Training Unifies Sparse and Dense Scaling Laws.” In <i>13th International Conference on Learning Representations</i>, 85165–81. ICLR, 2025.","apa":"Jin, T., Humayun, A. I., Evci, U., Subramanian, S., Yazdanbakhsh, A., Alistarh, D.-A., &#38; Dziugaite, G. K. (2025). The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. In <i>13th International Conference on Learning Representations</i> (pp. 85165–85181). Singapore, Singapore: ICLR.","mla":"Jin, Tian, et al. “The Journey Matters: Average Parameter Count over Pre-Training Unifies Sparse and Dense Scaling Laws.” <i>13th International Conference on Learning Representations</i>, ICLR, 2025, pp. 85165–81.","ista":"Jin T, Humayun AI, Evci U, Subramanian S, Yazdanbakhsh A, Alistarh D-A, Dziugaite GK. 2025. The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 85165–85181.","ieee":"T. Jin <i>et al.</i>, “The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws,” in <i>13th International Conference on Learning Representations</i>, Singapore, Singapore, 2025, pp. 85165–85181.","short":"T. Jin, A.I. Humayun, U. Evci, S. Subramanian, A. Yazdanbakhsh, D.-A. Alistarh, G.K. Dziugaite, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 85165–85181.","ama":"Jin T, Humayun AI, Evci U, et al. The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws. In: <i>13th International Conference on Learning Representations</i>. 
ICLR; 2025:85165-85181."},"quality_controlled":"1","scopus_import":"1","language":[{"iso":"eng"}],"publication":"13th International Conference on Learning Representations","page":"85165-85181","year":"2025","status":"public","acknowledgement":"We are deeply grateful to Elias Frantar, Naveen Kumar, Sanjiv Kumar, Daniel M. Roy, and Clemens Schaefer for their valuable feedback and thoughtful review of this paper.\r\nWe also acknowledge the critical support provided by the Google CoreML Performance Team, and Google Research during this project. We further recognize the extended team at Google DeepMind, who enabled and supported this research direction.\r\nThis work was in part supported by the Sloan Foundation, the MIT-IBM Watson AI Lab, Apple, and SRC JUMP 2.0 (CoCoSys).","author":[{"last_name":"Jin","full_name":"Jin, Tian","first_name":"Tian"},{"first_name":"Ahmed Imtiaz","last_name":"Humayun","full_name":"Humayun, Ahmed Imtiaz"},{"first_name":"Utku","full_name":"Evci, Utku","last_name":"Evci"},{"first_name":"Suvinay","last_name":"Subramanian","full_name":"Subramanian, Suvinay"},{"last_name":"Yazdanbakhsh","full_name":"Yazdanbakhsh, Amir","first_name":"Amir"},{"first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"},{"first_name":"Gintare Karolina","full_name":"Dziugaite, Gintare Karolina","last_name":"Dziugaite"}],"external_id":{"arxiv":["2501.12486"]},"date_published":"2025-04-01T00:00:00Z","file":[{"creator":"dernst","file_id":"20111","relation":"main_file","date_updated":"2025-08-04T08:23:47Z","checksum":"dbc27120e9aba67dffbd9e5d513a6803","content_type":"application/pdf","access_level":"open_access","date_created":"2025-08-04T08:23:47Z","success":1,"file_name":"2025_ICLR_Jin.pdf","file_size":704989}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","title":"The journey matters: Average parameter count over pre-training unifies sparse and dense scaling 
laws","tmp":{"image":"/images/cc_by.png","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","short":"CC BY (4.0)"},"department":[{"_id":"DaAl"}],"date_updated":"2025-08-04T08:24:59Z","publication_identifier":{"isbn":["9798331320850"]},"OA_place":"publisher","_id":"20038","arxiv":1,"oa_version":"Published Version","publisher":"ICLR","publication_status":"published","type":"conference","OA_type":"diamond","month":"04"}]