[{"ddc":["000"],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2025","intvolume":"       267","OA_place":"publisher","date_published":"2025-07-30T00:00:00Z","quality_controlled":"1","type":"conference","external_id":{"arxiv":["2501.19104"]},"abstract":[{"text":"Neural Collapse is a phenomenon where the last-layer representations of a well-trained neural network converge to a highly structured geometry. In this paper, we focus on its first (and most basic) property, known as NC1: the within-class variability vanishes. While prior theoretical studies establish the occurrence of NC1 via the data-agnostic unconstrained features model, our work adopts a data-specific perspective, analyzing NC1 in a three-layer neural network, with the first two layers operating in the mean-field regime and followed by a linear layer. In particular, we establish a fundamental connection between NC1 and the loss landscape: we prove that points with small empirical loss and gradient norm (thus, close to being stationary) approximately satisfy NC1, and the closeness to NC1 is controlled by the residual loss and gradient norm. We then show that (i) gradient flow on the mean squared error converges to NC1 solutions with small empirical loss, and (ii) for well-separated data distributions, both NC1 and vanishing test loss are achieved simultaneously. This aligns with the empirical observation that NC1 emerges during training while models attain near-zero test error. Overall, our results demonstrate that NC1 arises from gradient training due to the properties of the loss landscape, and they show the co-occurrence of NC1 and small test error for certain data distributions.","lang":"eng"}],"_id":"21326","language":[{"iso":"eng"}],"day":"30","citation":{"apa":"Wu, D., &#38; Mondelli, M. (2025). Neural collapse beyond the unconstrained features model: Landscape, dynamics, and generalization in the mean-field regime. In <i>Proceedings of the 42nd International Conference on Machine Learning</i> (Vol. 267, pp. 67499–67536). Vancouver, Canada: ML Research Press.","mla":"Wu, Diyuan, and Marco Mondelli. “Neural Collapse beyond the Unconstrained Features Model: Landscape, Dynamics, and Generalization in the Mean-Field Regime.” <i>Proceedings of the 42nd International Conference on Machine Learning</i>, vol. 267, ML Research Press, 2025, pp. 67499–536.","short":"D. Wu, M. Mondelli, in:, Proceedings of the 42nd International Conference on Machine Learning, ML Research Press, 2025, pp. 67499–67536.","ieee":"D. Wu and M. Mondelli, “Neural collapse beyond the unconstrained features model: Landscape, dynamics, and generalization in the mean-field regime,” in <i>Proceedings of the 42nd International Conference on Machine Learning</i>, Vancouver, Canada, 2025, vol. 267, pp. 67499–67536.","chicago":"Wu, Diyuan, and Marco Mondelli. “Neural Collapse beyond the Unconstrained Features Model: Landscape, Dynamics, and Generalization in the Mean-Field Regime.” In <i>Proceedings of the 42nd International Conference on Machine Learning</i>, 267:67499–536. ML Research Press, 2025.","ama":"Wu D, Mondelli M. Neural collapse beyond the unconstrained features model: Landscape, dynamics, and generalization in the mean-field regime. In: <i>Proceedings of the 42nd International Conference on Machine Learning</i>. Vol 267. ML Research Press; 2025:67499-67536.","ista":"Wu D, Mondelli M. 2025. Neural collapse beyond the unconstrained features model: Landscape, dynamics, and generalization in the mean-field regime. 
Proceedings of the 42nd International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 267, 67499–67536."},"has_accepted_license":"1","page":"67499-67536","status":"public","file_date_updated":"2026-02-19T08:28:22Z","publication":"Proceedings of the 42nd International Conference on Machine Learning","date_updated":"2026-02-19T08:30:42Z","arxiv":1,"publication_identifier":{"eissn":["2640-3498"]},"OA_type":"gold","oa_version":"Published Version","title":"Neural collapse beyond the unconstrained features model: Landscape, dynamics, and generalization in the mean-field regime","date_created":"2026-02-18T12:02:45Z","alternative_title":["PMLR"],"conference":{"start_date":"2025-07-13","name":"ICML: International Conference on Machine Learning","location":"Vancouver, Canada","end_date":"2025-07-19"},"publication_status":"published","file":[{"access_level":"open_access","success":1,"content_type":"application/pdf","date_created":"2026-02-19T08:28:22Z","file_size":3994385,"checksum":"c5ce8b1c83e33dc3a11122f4910deb67","file_name":"2025_ICML_Wu.pdf","creator":"dernst","relation":"main_file","file_id":"21337","date_updated":"2026-02-19T08:28:22Z"}],"publisher":"ML Research Press","article_processing_charge":"No","tmp":{"short":"CC BY (4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)"},"corr_author":"1","oa":1,"acknowledgement":"This research was funded in whole or in part by the Austrian Science Fund (FWF) 10.55776/COE12. For the purpose of open access, the authors have applied a CC BY public\r\ncopyright license to any Author Accepted Manuscript version arising from this submission. The authors would like to thank Peter Sukenık for general helpful discussions and for pointing out that all the stationary points are approximately proportional in the case without entropic regularization. ","month":"07","department":[{"_id":"MaMo"}],"author":[{"last_name":"Wu","full_name":"Wu, Diyuan","first_name":"Diyuan","id":"1a5914c2-896a-11ed-bdf8-fb80621a0635"},{"last_name":"Mondelli","full_name":"Mondelli, Marco","first_name":"Marco","id":"27EB676C-8706-11E9-9510-7717E6697425","orcid":"0000-0002-3242-7020"}],"volume":267},{"acknowledgement":"The authors thank the anonymous NeurIPS reviewers for their useful comments and feedback, the IT department from the Institute of Science and Technology Austria for the hardware support, and Weights and Biases for the infrastructure to track all our experiments. 
Mher Safaryan has received funding from the European Union’s Horizon 2020 research and innovation program under the Maria Skłodowska-Curie grant agreement No 101034413.","oa":1,"project":[{"name":"IST-BRIDGE: International postdoctoral program","call_identifier":"H2020","_id":"fc2ed2f7-9c52-11eb-aca3-c01059dda49c","grant_number":"101034413"}],"department":[{"_id":"DaAl"},{"_id":"MaMo"}],"month":"12","volume":37,"author":[{"id":"1a5914c2-896a-11ed-bdf8-fb80621a0635","first_name":"Diyuan","last_name":"Wu","full_name":"Wu, Diyuan"},{"id":"449f7a18-f128-11eb-9611-9b430c0c6333","first_name":"Ionut-Vlad","full_name":"Modoranu, Ionut-Vlad","last_name":"Modoranu"},{"first_name":"Mher","id":"dd546b39-0804-11ed-9c55-ef075c39778d","last_name":"Safaryan","full_name":"Safaryan, Mher"},{"last_name":"Kuznedelev","full_name":"Kuznedelev, Denis","first_name":"Denis"},{"last_name":"Alistarh","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87"}],"publisher":"Neural Information Processing Systems Foundation","corr_author":"1","article_processing_charge":"No","conference":{"end_date":"2024-12-15","location":"Vancouver, Canada","name":"NeurIPS: Neural Information Processing Systems","start_date":"2024-12-09"},"publication_status":"published","publication_identifier":{"issn":["1049-5258"]},"arxiv":1,"oa_version":"Preprint","OA_type":"green","alternative_title":["Advances in Neural Information Processing Systems"],"title":"The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information","date_created":"2025-04-06T22:01:32Z","acknowledged_ssus":[{"_id":"CampIT"}],"date_updated":"2025-05-14T11:37:10Z","publication":"38th Conference on Neural Information Processing Systems","status":"public","scopus_import":"1","ec_funded":1,"type":"conference","quality_controlled":"1","language":[{"iso":"eng"}],"_id":"19518","abstract":[{"text":"The rising footprint of machine learning has led to a focus on imposing model\r\nsparsity as a means of reducing computational and memory costs. For deep neural\r\nnetworks (DNNs), the state-of-the-art accuracy-vs-sparsity is achieved by heuristics\r\ninspired by the classical Optimal Brain Surgeon (OBS) framework [LeCun et al.,\r\n1989, Hassibi and Stork, 1992, Hassibi et al., 1993], which leverages loss curvature\r\ninformation to make better pruning decisions. Yet, these results still lack a solid\r\ntheoretical understanding, and it is unclear whether they can be improved by\r\nleveraging connections to the wealth of work on sparse recovery algorithms. In this\r\npaper, we draw new connections between these two areas and present new sparse\r\nrecovery algorithms inspired by the OBS framework that comes with theoretical\r\nguarantees under reasonable assumptions and have strong practical performance.\r\nSpecifically, our work starts from the observation that we can leverage curvature\r\ninformation in OBS-like fashion upon the projection step of classic iterative sparse\r\nrecovery algorithms such as IHT. We show for the first time that this leads both\r\nto improved convergence bounds under standard assumptions. Furthermore, we\r\npresent extensions of this approach to the practical task of obtaining accurate sparse\r\nDNNs, and validate it experimentally at scale for Transformer-based models on\r\nvision and language tasks.","lang":"eng"}],"external_id":{"arxiv":["2408.17163"]},"citation":{"ama":"Wu D, Modoranu I-V, Safaryan M, Kuznedelev D, Alistarh D-A. 
The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. In: <i>38th Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","ista":"Wu D, Modoranu I-V, Safaryan M, Kuznedelev D, Alistarh D-A. 2024. The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. 38th Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","chicago":"Wu, Diyuan, Ionut-Vlad Modoranu, Mher Safaryan, Denis Kuznedelev, and Dan-Adrian Alistarh. “The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by Leveraging Second-Order Information.” In <i>38th Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","mla":"Wu, Diyuan, et al. “The Iterative Optimal Brain Surgeon: Faster Sparse Recovery by Leveraging Second-Order Information.” <i>38th Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","ieee":"D. Wu, I.-V. Modoranu, M. Safaryan, D. Kuznedelev, and D.-A. Alistarh, “The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information,” in <i>38th Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","short":"D. Wu, I.-V. Modoranu, M. Safaryan, D. Kuznedelev, D.-A. Alistarh, in:, 38th Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","apa":"Wu, D., Modoranu, I.-V., Safaryan, M., Kuznedelev, D., &#38; Alistarh, D.-A. (2024). The iterative optimal brain surgeon: Faster sparse recovery by leveraging second-order information. In <i>38th Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation."},"day":"20","year":"2024","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","intvolume":"        37","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2408.17163"}],"date_published":"2024-12-20T00:00:00Z","OA_place":"repository"},{"publication":"Transactions on Machine Learning Research","date_updated":"2025-04-15T07:50:17Z","status":"public","day":"28","citation":{"apa":"Wu, D., Kungurtsev, V., &#38; Mondelli, M. (2023). Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence. In <i>Transactions on Machine Learning Research</i>. ML Research Press.","short":"D. Wu, V. Kungurtsev, M. Mondelli, in:, Transactions on Machine Learning Research, ML Research Press, 2023.","ieee":"D. Wu, V. Kungurtsev, and M. Mondelli, “Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence,” in <i>Transactions on Machine Learning Research</i>, 2023.","mla":"Wu, Diyuan, et al. “Mean-Field Analysis for Heavy Ball Methods: Dropout-Stability, Connectivity, and Global Convergence.” <i>Transactions on Machine Learning Research</i>, ML Research Press, 2023.","chicago":"Wu, Diyuan, Vyacheslav Kungurtsev, and Marco Mondelli. “Mean-Field Analysis for Heavy Ball Methods: Dropout-Stability, Connectivity, and Global Convergence.” In <i>Transactions on Machine Learning Research</i>. ML Research Press, 2023.","ista":"Wu D, Kungurtsev V, Mondelli M. 2023. Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence. 
Transactions on Machine Learning Research. , TMLR, .","ama":"Wu D, Kungurtsev V, Mondelli M. Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence. In: <i>Transactions on Machine Learning Research</i>. ML Research Press; 2023."},"has_accepted_license":"1","external_id":{"arxiv":["2210.06819"]},"language":[{"iso":"eng"}],"_id":"14924","abstract":[{"text":"The stochastic heavy ball method (SHB), also known as stochastic gradient descent (SGD) with Polyak's momentum, is widely used in training neural networks. However, despite the remarkable success of such algorithm in practice, its theoretical characterization remains limited. In this paper, we focus on neural networks with two and three layers and provide a rigorous understanding of the properties of the solutions found by SHB: \\emph{(i)} stability after dropping out part of the neurons, \\emph{(ii)} connectivity along a low-loss path, and \\emph{(iii)} convergence to the global optimum.\r\nTo achieve this goal, we take a mean-field view and relate the SHB dynamics to a certain partial differential equation in the limit of large network widths. This mean-field perspective has inspired a recent line of work focusing on SGD while, in contrast, our paper considers an algorithm with momentum. More specifically, after proving existence and uniqueness of the limit differential equations, we show convergence to the global optimum and give a quantitative bound between the mean-field limit and the SHB dynamics of a finite-width network. Armed with this last bound, we are able to establish the dropout-stability and connectivity of SHB solutions.","lang":"eng"}],"quality_controlled":"1","type":"conference","date_published":"2023-02-28T00:00:00Z","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2210.06819"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","year":"2023","author":[{"full_name":"Wu, Diyuan","last_name":"Wu","id":"1a5914c2-896a-11ed-bdf8-fb80621a0635","first_name":"Diyuan"},{"full_name":"Kungurtsev, Vyacheslav","last_name":"Kungurtsev","first_name":"Vyacheslav"},{"full_name":"Mondelli, Marco","last_name":"Mondelli","first_name":"Marco","id":"27EB676C-8706-11E9-9510-7717E6697425","orcid":"0000-0002-3242-7020"}],"department":[{"_id":"MaMo"}],"month":"02","project":[{"_id":"059876FA-7A3F-11EA-A408-12923DDC885E","name":"Prix Lopez-Loretta 2019 - Marco Mondelli"}],"oa":1,"acknowledgement":"D. Wu and M. Mondelli are partially supported by the 2019 Lopez-Loreta Prize. V. Kungurtsev was supported by the OP VVV project CZ.02.1.01/0.0/0.0/16_019/0000765 \"Research Center for Informatics\".","article_processing_charge":"No","tmp":{"short":"CC BY (4.0)","image":"/images/cc_by.png","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)"},"corr_author":"1","publisher":"ML Research Press","publication_status":"published","title":"Mean-field analysis for heavy ball methods: Dropout-stability, connectivity, and global convergence","alternative_title":["TMLR"],"date_created":"2024-02-02T11:21:56Z","oa_version":"Published Version","arxiv":1}]
