[{"language":[{"iso":"eng"}],"date_updated":"2024-10-01T09:30:03Z","external_id":{"arxiv":["2402.04054"]},"status":"public","article_processing_charge":"No","department":[{"_id":"ChLa"}],"publication":"Proceedings of the 41st International Conference on Machine Learning","type":"conference","year":"2024","publisher":"ML Research Press","conference":{"end_date":"2024-07-27","name":"ICML: International Conference on Machine Learning","start_date":"2024-07-21","location":"Vienna, Austria"},"month":"09","corr_author":"1","day":"01","abstract":[{"lang":"eng","text":"We introduce a new framework for studying meta-learning methods using PAC-Bayesian theory. Its main advantage over previous work is that it allows for more flexibility in how the transfer of knowledge between tasks is realized. For previous approaches, this could only happen indirectly, by means of learning prior distributions over models. In contrast, the new generalization bounds that we prove express the process of meta-learning much more directly as learning the learning algorithm that should be used for future tasks. The flexibility of our framework makes it suitable to analyze a wide range of meta-learning mechanisms and even design new mechanisms. 
Other than our theoretical contributions we also show empirically that our framework improves the prediction quality in practical meta-learning mechanisms."}],"publication_status":"published","oa_version":"Published Version","title":"More flexible PAC-Bayesian meta-learning by learning learning algorithms","page":"58122-58139","main_file_link":[{"url":" https://doi.org/10.48550/arXiv.2402.04054","open_access":"1"}],"quality_controlled":"1","intvolume":"       235","alternative_title":["PMLR"],"publication_identifier":{"eissn":["2640-3498"]},"volume":235,"date_created":"2024-09-22T22:01:45Z","author":[{"full_name":"Zakerinia, Hossein","id":"653bd8b6-f394-11eb-9cf6-c0bbf6cd78d4","first_name":"Hossein","last_name":"Zakerinia"},{"last_name":"Behjati","first_name":"Amin","full_name":"Behjati, Amin"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887"}],"scopus_import":"1","_id":"18118","arxiv":1,"citation":{"apa":"Zakerinia, H., Behjati, A., &#38; Lampert, C. (2024). More flexible PAC-Bayesian meta-learning by learning learning algorithms. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 58122–58139). Vienna, Austria: ML Research Press.","chicago":"Zakerinia, Hossein, Amin Behjati, and Christoph Lampert. “More Flexible PAC-Bayesian Meta-Learning by Learning Learning Algorithms.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:58122–39. ML Research Press, 2024.","mla":"Zakerinia, Hossein, et al. “More Flexible PAC-Bayesian Meta-Learning by Learning Learning Algorithms.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 58122–39.","short":"H. Zakerinia, A. Behjati, C. Lampert, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 
58122–58139.","ama":"Zakerinia H, Behjati A, Lampert C. More flexible PAC-Bayesian meta-learning by learning learning algorithms. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:58122-58139.","ieee":"H. Zakerinia, A. Behjati, and C. Lampert, “More flexible PAC-Bayesian meta-learning by learning learning algorithms,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 58122–58139.","ista":"Zakerinia H, Behjati A, Lampert C. 2024. More flexible PAC-Bayesian meta-learning by learning learning algorithms. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 58122–58139."},"date_published":"2024-09-01T00:00:00Z","oa":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87"},{"abstract":[{"lang":"eng","text":"This research is aimed to solve the tweet/user geolocation prediction task and provide a flexible methodology for the geo-tagging of textual big data. The suggested approach implements neural networks for natural language processing (NLP) to estimate the location as coordinate pairs (longitude, latitude) and two-dimensional Gaussian Mixture Models (GMMs). The scope of proposed models has been finetuned on a Twitter dataset using pretrained Bidirectional Encoder Representations from Transformers (BERT) as base models. Performance metrics show a median error of fewer than 30 km on a worldwide-level, and fewer than 15 km on the US-level datasets for the models trained and evaluated on text features of tweets' content and metadata context. 
Our source code and data are available at https://github.com/K4TEL/geo-twitter.git."}],"title":"Predicting the geolocation of tweets using transformer models on customized data","oa_version":"Published Version","DOAJ_listed":"1","publication_status":"published","page":"69-99","quality_controlled":"1","file_date_updated":"2025-01-20T08:41:10Z","acknowledgement":"The authors acknowledge the Institute of Science and Technology (ISTA) for their material support and for granting access to the Twitter database archive, which was essential for the research.","publication_identifier":{"eissn":["1948-660X"]},"article_type":"original","_id":"18856","OA_place":"publisher","scopus_import":"1","date_created":"2025-01-19T23:01:53Z","author":[{"full_name":"Lutsai, Kateryna","last_name":"Lutsai","first_name":"Kateryna"},{"first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887","full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"}],"doi":"10.5311/JOSIS.2024.29.295","user_id":"68b8ca59-c5b3-11ee-8790-cd641c68093d","oa":1,"date_published":"2024-12-26T00:00:00Z","citation":{"ieee":"K. Lutsai and C. Lampert, “Predicting the geolocation of tweets using transformer models on customized data,” <i>Journal of Spatial Information Science</i>, no. 29. University of Maine, pp. 69–99, 2024.","ista":"Lutsai K, Lampert C. 2024. Predicting the geolocation of tweets using transformer models on customized data. Journal of Spatial Information Science. (29), 69–99.","apa":"Lutsai, K., &#38; Lampert, C. (2024). Predicting the geolocation of tweets using transformer models on customized data. <i>Journal of Spatial Information Science</i>. University of Maine. <a href=\"https://doi.org/10.5311/JOSIS.2024.29.295\">https://doi.org/10.5311/JOSIS.2024.29.295</a>","ama":"Lutsai K, Lampert C. Predicting the geolocation of tweets using transformer models on customized data. <i>Journal of Spatial Information Science</i>. 2024;(29):69-99. 
doi:<a href=\"https://doi.org/10.5311/JOSIS.2024.29.295\">10.5311/JOSIS.2024.29.295</a>","short":"K. Lutsai, C. Lampert, Journal of Spatial Information Science (2024) 69–99.","mla":"Lutsai, Kateryna, and Christoph Lampert. “Predicting the Geolocation of Tweets Using Transformer Models on Customized Data.” <i>Journal of Spatial Information Science</i>, no. 29, University of Maine, 2024, pp. 69–99, doi:<a href=\"https://doi.org/10.5311/JOSIS.2024.29.295\">10.5311/JOSIS.2024.29.295</a>.","chicago":"Lutsai, Kateryna, and Christoph Lampert. “Predicting the Geolocation of Tweets Using Transformer Models on Customized Data.” <i>Journal of Spatial Information Science</i>. University of Maine, 2024. <a href=\"https://doi.org/10.5311/JOSIS.2024.29.295\">https://doi.org/10.5311/JOSIS.2024.29.295</a>."},"related_material":{"link":[{"url":"https://github.com/K4TEL/geo-twitter.git","relation":"software"}]},"language":[{"iso":"eng"}],"file":[{"file_name":"2024_JourSpatialInfoScience_Lutsai.pdf","date_created":"2025-01-20T08:41:10Z","file_id":"18857","checksum":"b82413f00398ffb5168e8e747571a98d","creator":"dernst","access_level":"open_access","relation":"main_file","content_type":"application/pdf","date_updated":"2025-01-20T08:41:10Z","file_size":7250655,"success":1}],"date_updated":"2025-06-05T13:47:12Z","ddc":["500"],"status":"public","publication":"Journal of Spatial Information Science","has_accepted_license":"1","department":[{"_id":"ChLa"}],"article_processing_charge":"Yes","tmp":{"name":"Creative Commons Attribution 3.0 Unported (CC BY 3.0)","short":"CC BY (3.0)","legal_code_url":"https://creativecommons.org/licenses/by/3.0/legalcode","image":"/images/cc_by.png"},"year":"2024","type":"journal_article","corr_author":"1","month":"12","publisher":"University of Maine","issue":"29","day":"26","OA_type":"gold","license":"https://creativecommons.org/licenses/by/3.0/"},{"publication_identifier":{"eissn":["1049-5258"]},"volume":37,"alternative_title":["Advances in Neural 
Information Processing Systems"],"intvolume":"        37","oa":1,"date_published":"2024-12-01T00:00:00Z","arxiv":1,"citation":{"ieee":"N. Kalinin and C. Lampert, “Banded square root matrix factorization for differentially private model training,” in <i>38th Annual Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","ista":"Kalinin N, Lampert C. 2024. Banded square root matrix factorization for differentially private model training. 38th Annual Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","apa":"Kalinin, N., &#38; Lampert, C. (2024). Banded square root matrix factorization for differentially private model training. In <i>38th Annual Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation.","ama":"Kalinin N, Lampert C. Banded square root matrix factorization for differentially private model training. In: <i>38th Annual Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","chicago":"Kalinin, Nikita, and Christoph Lampert. “Banded Square Root Matrix Factorization for Differentially Private Model Training.” In <i>38th Annual Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","mla":"Kalinin, Nikita, and Christoph Lampert. “Banded Square Root Matrix Factorization for Differentially Private Model Training.” <i>38th Annual Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","short":"N. Kalinin, C. 
Lampert, in:, 38th Annual Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024."},"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_created":"2025-01-24T17:58:16Z","author":[{"last_name":"Kalinin","first_name":"Nikita","id":"4b14526e-14d2-11ed-ba64-c14c9553d137","full_name":"Kalinin, Nikita"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","orcid":"0000-0001-8622-7887","last_name":"Lampert"}],"_id":"18875","OA_place":"publisher","scopus_import":"1","publication_status":"published","oa_version":"Published Version","title":"Banded square root matrix factorization for differentially private model training","abstract":[{"text":"Current state-of-the-art methods for differentially private model training are based on matrix factorization techniques. However, these methods suffer from high computational overhead because they require numerically solving a demanding optimization problem to determine an approximately optimal factorization prior to the actual model training. In this work, we present a new matrix factorization approach, BSR, which overcomes this computational bottleneck. By exploiting properties of the standard matrix square root, BSR allows to efficiently handle also large-scale problems. For the key scenario of stochastic gradient descent with momentum and weight decay, we even derive analytical expressions for BSR that render the computational overhead negligible. We prove bounds on the approximation quality that hold both in the centralized and in the federated learning setting. 
Our numerical experiments demonstrate that models trained using BSR perform on par with the best existing methods, while completely avoiding their computational overhead.","lang":"eng"}],"file_date_updated":"2025-01-27T09:52:15Z","quality_controlled":"1","type":"conference","year":"2024","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png"},"day":"01","OA_type":"gold","license":"https://creativecommons.org/licenses/by/4.0/","conference":{"end_date":"2024-12-16","name":"NeurIPS: Neural Information Processing Systems","start_date":"2024-12-16","location":"Vancouver, Canada"},"publisher":"Neural Information Processing Systems Foundation","corr_author":"1","month":"12","date_updated":"2025-05-14T11:34:20Z","ddc":["000"],"file":[{"relation":"main_file","checksum":"a216cab8eddc1fe7840aede0e2c0d41e","date_created":"2025-01-27T09:52:15Z","file_id":"18888","file_name":"2024_NeurIPS_Nikita.pdf","creator":"dernst","access_level":"open_access","content_type":"application/pdf","file_size":1144656,"date_updated":"2025-01-27T09:52:15Z","success":1}],"language":[{"iso":"eng"}],"article_processing_charge":"No","publication":"38th Annual Conference on Neural Information Processing Systems","has_accepted_license":"1","department":[{"_id":"GradSch"},{"_id":"ChLa"}],"external_id":{"arxiv":["2405.13763"]},"status":"public"},{"file_date_updated":"2025-02-04T08:11:25Z","quality_controlled":"1","acknowledged_ssus":[{"_id":"ScienComp"}],"abstract":[{"text":"Deep neural networks (DNNs) exhibit a surprising structure in their final layer\r\nknown as neural collapse (NC), and a growing body of works has currently investigated the propagation of neural collapse to earlier layers of DNNs – a phenomenon\r\ncalled deep neural collapse (DNC). 
However, existing theoretical results are restricted to special cases: linear models, only two layers or binary classification.\r\nIn contrast, we focus on non-linear models of arbitrary depth in multi-class classification and reveal a surprising qualitative shift. As soon as we go beyond two\r\nlayers or two classes, DNC stops being optimal for the deep unconstrained features\r\nmodel (DUFM) – the standard theoretical framework for the analysis of collapse.\r\nThe main culprit is a low-rank bias of multi-layer regularization schemes: this bias\r\nleads to optimal solutions of even lower rank than the neural collapse. We support\r\nour theoretical findings with experiments on both DUFM and real data, which show\r\nthe emergence of the low-rank structure in the solution found by gradient descent.","lang":"eng"}],"publication_status":"published","oa_version":"Published Version","title":"Neural collapse versus low-rank bias: Is deep neural collapse really optimal?","author":[{"id":"d64d6a8d-eb8e-11eb-b029-96fd216dec3c","full_name":"Súkeník, Peter","last_name":"Súkeník","first_name":"Peter"},{"last_name":"Lampert","orcid":"0000-0001-8622-7887","first_name":"Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","full_name":"Lampert, Christoph"},{"id":"27EB676C-8706-11E9-9510-7717E6697425","full_name":"Mondelli, Marco","last_name":"Mondelli","orcid":"0000-0002-3242-7020","first_name":"Marco"}],"date_created":"2025-01-27T11:15:18Z","_id":"18891","OA_place":"publisher","oa":1,"date_published":"2024-12-01T00:00:00Z","citation":{"ista":"Súkeník P, Lampert C, Mondelli M. 2024. Neural collapse versus low-rank bias: Is deep neural collapse really optimal? 38th Annual Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, Advances in Neural Information Processing Systems, vol. 37.","ieee":"P. Súkeník, C. Lampert, and M. 
Mondelli, “Neural collapse versus low-rank bias: Is deep neural collapse really optimal?,” in <i>38th Annual Conference on Neural Information Processing Systems</i>, Vancouver, Canada, 2024, vol. 37.","short":"P. Súkeník, C. Lampert, M. Mondelli, in:, 38th Annual Conference on Neural Information Processing Systems, Neural Information Processing Systems Foundation, 2024.","mla":"Súkeník, Peter, et al. “Neural Collapse versus Low-Rank Bias: Is Deep Neural Collapse Really Optimal?” <i>38th Annual Conference on Neural Information Processing Systems</i>, vol. 37, Neural Information Processing Systems Foundation, 2024.","chicago":"Súkeník, Peter, Christoph Lampert, and Marco Mondelli. “Neural Collapse versus Low-Rank Bias: Is Deep Neural Collapse Really Optimal?” In <i>38th Annual Conference on Neural Information Processing Systems</i>, Vol. 37. Neural Information Processing Systems Foundation, 2024.","ama":"Súkeník P, Lampert C, Mondelli M. Neural collapse versus low-rank bias: Is deep neural collapse really optimal? In: <i>38th Annual Conference on Neural Information Processing Systems</i>. Vol 37. Neural Information Processing Systems Foundation; 2024.","apa":"Súkeník, P., Lampert, C., &#38; Mondelli, M. (2024). Neural collapse versus low-rank bias: Is deep neural collapse really optimal? In <i>38th Annual Conference on Neural Information Processing Systems</i> (Vol. 37). Vancouver, Canada: Neural Information Processing Systems Foundation."},"arxiv":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","intvolume":"        37","acknowledgement":"Marco Mondelli is partially supported by the 2019 Lopez-Loreta prize. 
This research was supported by the Scientific Service Units (SSU) of ISTA through resources provided by Scientific Computing (SciComp).","volume":37,"alternative_title":["Advances in Neural Information Processing Systems"],"external_id":{"arxiv":["2405.14468"]},"status":"public","article_processing_charge":"No","publication":"38th Annual Conference on Neural Information Processing Systems","has_accepted_license":"1","department":[{"_id":"GradSch"},{"_id":"MaMo"},{"_id":"ChLa"}],"language":[{"iso":"eng"}],"ddc":["000"],"file":[{"creator":"dernst","access_level":"open_access","file_name":"2024_NeurIPS_Sukenik.pdf","checksum":"b7b79f1ea3ac1e9e11b3d91faaeb0780","date_created":"2025-02-04T08:11:25Z","file_id":"18989","relation":"main_file","success":1,"content_type":"application/pdf","date_updated":"2025-02-04T08:11:25Z","file_size":1784118}],"date_updated":"2025-06-04T07:19:21Z","conference":{"location":"Vancouver, Canada","name":"NeurIPS: Neural Information Processing Systems","start_date":"2024-12-16","end_date":"2024-12-16"},"publisher":"Neural Information Processing Systems Foundation","corr_author":"1","month":"12","day":"01","OA_type":"gold","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png"},"type":"conference","project":[{"_id":"059876FA-7A3F-11EA-A408-12923DDC885E","name":"Prix Lopez-Loretta 2019 - Marco 
Mondelli"}],"year":"2024"},{"article_processing_charge":"No","has_accepted_license":"1","department":[{"_id":"GradSch"},{"_id":"ChLa"}],"publication":"arXiv","article_number":"2403.06833","external_id":{"arxiv":["2403.06833"]},"status":"public","date_updated":"2025-02-24T12:52:23Z","ddc":["000"],"file":[{"file_size":530972,"date_updated":"2025-02-20T10:11:45Z","content_type":"application/pdf","success":1,"relation":"main_file","file_name":"2403.06833v3.pdf","file_id":"19064","date_created":"2025-02-20T10:11:45Z","checksum":"35eb43968684b87be59144603ef10af0","access_level":"open_access","creator":"ezverev"}],"language":[{"iso":"eng"}],"related_material":{"link":[{"relation":"software","url":" https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed"}]},"OA_type":"green","license":"https://creativecommons.org/licenses/by-sa/4.0/","day":"01","month":"03","corr_author":"1","type":"preprint","year":"2024","tmp":{"short":"CC BY-SA (4.0)","name":"Creative Commons Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0)","legal_code_url":"https://creativecommons.org/licenses/by-sa/4.0/legalcode","image":"/images/cc_by_sa.png"},"file_date_updated":"2025-02-20T10:11:45Z","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2403.06833","open_access":"1"}],"publication_status":"published","oa_version":"Preprint","title":"Can LLMs separate instructions from data? And what do we even mean by that?","acknowledged_ssus":[{"_id":"ScienComp"}],"abstract":[{"lang":"eng","text":"Instruction-tuned Large Language Models (LLMs) show impressive results in numerous practical applications, but they lack essential safety features that are common in other areas of computer science, particularly an explicit separation of instructions and data. This makes them vulnerable to manipulations such as indirect prompt injections and generally unsuitable for safety-critical tasks. 
Surprisingly, there is currently no established definition or benchmark to quantify this phenomenon. In this work, we close this gap by introducing a formal measure for instruction-data separation and an empirical variant that is calculable from a model's outputs. We also present a new dataset, SEP, that allows estimating the measure for real-world models. Our results on various LLMs show that the problem of instruction-data separation is real: all models fail to achieve high separation, and canonical mitigation techniques, such as prompt engineering and fine-tuning, either fail to substantially improve separation or reduce model utility. The source code and SEP dataset are openly accessible at https://github.com/egozverev/Shold-It-Be-Executed-Or-Processed.\r\n"}],"arxiv":1,"citation":{"ieee":"E. Zverev, S. Abdelnabi, S. Tabesh, M. Fritz, and C. Lampert, “Can LLMs separate instructions from data? And what do we even mean by that?,” <i>arXiv</i>. 2024.","ista":"Zverev E, Abdelnabi S, Tabesh S, Fritz M, Lampert C. 2024. Can LLMs separate instructions from data? And what do we even mean by that? arXiv, 2403.06833.","apa":"Zverev, E., Abdelnabi, S., Tabesh, S., Fritz, M., &#38; Lampert, C. (2024). Can LLMs separate instructions from data? And what do we even mean by that? <i>arXiv</i>. <a href=\"https://doi.org/10.48550/arXiv.2403.06833\">https://doi.org/10.48550/arXiv.2403.06833</a>","mla":"Zverev, Egor, et al. “Can LLMs Separate Instructions from Data? And What Do We Even Mean by That?” <i>ArXiv</i>, 2403.06833, 2024, doi:<a href=\"https://doi.org/10.48550/arXiv.2403.06833\">10.48550/arXiv.2403.06833</a>.","short":"E. Zverev, S. Abdelnabi, S. Tabesh, M. Fritz, C. Lampert, ArXiv (2024).","chicago":"Zverev, Egor, Sahar Abdelnabi, Soroush Tabesh, Mario Fritz, and Christoph Lampert. “Can LLMs Separate Instructions from Data? And What Do We Even Mean by That?” <i>ArXiv</i>, 2024. 
<a href=\"https://doi.org/10.48550/arXiv.2403.06833\">https://doi.org/10.48550/arXiv.2403.06833</a>.","ama":"Zverev E, Abdelnabi S, Tabesh S, Fritz M, Lampert C. Can LLMs separate instructions from data? And what do we even mean by that? <i>arXiv</i>. 2024. doi:<a href=\"https://doi.org/10.48550/arXiv.2403.06833\">10.48550/arXiv.2403.06833</a>"},"oa":1,"date_published":"2024-03-01T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_created":"2025-02-20T10:13:42Z","author":[{"id":"05162b19-1340-11ed-8f02-fa94e0e8c3bc","full_name":"Zverev, Egor","last_name":"Zverev","first_name":"Egor"},{"full_name":"Abdelnabi, Sahar","first_name":"Sahar","last_name":"Abdelnabi"},{"full_name":"Tabesh, Soroush","id":"06000900-6068-11ef-8d61-c2472ef2e752","first_name":"Soroush","orcid":"0009-0003-4119-6281","last_name":"Tabesh"},{"full_name":"Fritz, Mario","first_name":"Mario","last_name":"Fritz"},{"first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887","full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"}],"doi":"10.48550/arXiv.2403.06833","OA_place":"repository","_id":"19063","acknowledgement":"The authors would like to sincerely thank Juan Rocamonde for valuable feedback to our manuscript. We acknowledge the support from the Scientific Service Units (SSU) of ISTA through resources provided by Scientific Computing (SciComp). We thank Dan Alistarh for providing us with computational resources. This work was partially funded by the German Federal Ministry of Education and Research (BMBF) under the grant AIgenCY (16KIS2012) and ELSA – European Lighthouse on Secure and Safe AI funded by the European Union under grant agreement No. 101070617. Views and opinions expressed are however those of the authors only and do not necessarily reflect those of the European Union or European Commission. 
Neither the European Union nor the European Commission can be held responsible for them."},{"article_processing_charge":"No","publication":"Transactions on Machine Learning Research","department":[{"_id":"ChLa"}],"has_accepted_license":"1","external_id":{"arxiv":["2311.11908"]},"status":"public","ddc":["000"],"file":[{"file_size":1367966,"date_updated":"2025-03-20T09:02:18Z","content_type":"application/pdf","success":1,"relation":"main_file","file_name":"2024_TMLR_Verwimp.pdf","date_created":"2025-03-20T09:02:18Z","file_id":"19426","checksum":"0714e12f7423cd098976ed9974561155","access_level":"open_access","creator":"dernst"}],"date_updated":"2025-03-20T09:21:02Z","language":[{"iso":"eng"}],"day":"12","OA_type":"diamond","publisher":"Transactions on Machine Learning Research","month":"04","type":"journal_article","year":"2024","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png"},"file_date_updated":"2025-03-20T09:02:18Z","quality_controlled":"1","publication_status":"published","title":"Continual learning: Applications and the road forward","oa_version":"Published Version","abstract":[{"lang":"eng","text":"Continual learning is a subfield of machine learning, which aims to allow machine learning models to continuously learn on new data, by accumulating knowledge without forgetting what was learned in the past. In this work, we take a step back, and ask: \"Why should one care about continual learning in the first place?\". We set the stage by examining recent continual learning papers published at four major machine learning conferences, and show that memory-constrained settings dominate the field. Then, we discuss five open problems in machine learning, and even though they might seem unrelated to continual learning at first sight, we show that continual learning will inevitably be part of their solution. 
These problems are model editing, personalization and specialization, on-device learning, faster (re-)training and reinforcement learning. Finally, by comparing the desiderata from these unsolved problems and the current assumptions in continual learning, we highlight and discuss four future directions for continual learning research. We hope that this work offers an interesting perspective on the future of continual learning, while displaying its potential value and the paths we have to pursue in order to make it successful. This work is the result of the many discussions the authors had at the Dagstuhl seminar on Deep Continual Learning, in March 2023."}],"date_published":"2024-04-12T00:00:00Z","oa":1,"citation":{"ieee":"E. Verwimp <i>et al.</i>, “Continual learning: Applications and the road forward,” <i>Transactions on Machine Learning Research</i>, vol. 2024. Transactions on Machine Learning Research, 2024.","ista":"Verwimp E, Aljundi R, Ben-David S, Bethge M, Cossu A, Gepperth A, Hayes TL, Hüllermeier E, Kanan C, Kudithipudi D, Lampert C, Mundt M, Pascanu R, Popescu A, Tolias AS, Van De Weijer J, Liu B, Lomonaco V, Tuytelaars T, Van De Ven GM. 2024. Continual learning: Applications and the road forward. Transactions on Machine Learning Research. 2024.","apa":"Verwimp, E., Aljundi, R., Ben-David, S., Bethge, M., Cossu, A., Gepperth, A., … Van De Ven, G. M. (2024). Continual learning: Applications and the road forward. <i>Transactions on Machine Learning Research</i>. Transactions on Machine Learning Research.","ama":"Verwimp E, Aljundi R, Ben-David S, et al. Continual learning: Applications and the road forward. <i>Transactions on Machine Learning Research</i>. 2024;2024.","short":"E. Verwimp, R. Aljundi, S. Ben-David, M. Bethge, A. Cossu, A. Gepperth, T.L. Hayes, E. Hüllermeier, C. Kanan, D. Kudithipudi, C. Lampert, M. Mundt, R. Pascanu, A. Popescu, A.S. Tolias, J. Van De Weijer, B. Liu, V. Lomonaco, T. Tuytelaars, G.M. 
Van De Ven, Transactions on Machine Learning Research 2024 (2024).","mla":"Verwimp, Eli, et al. “Continual Learning: Applications and the Road Forward.” <i>Transactions on Machine Learning Research</i>, vol. 2024, Transactions on Machine Learning Research, 2024.","chicago":"Verwimp, Eli, Rahaf Aljundi, Shai Ben-David, Matthias Bethge, Andrea Cossu, Alexander Gepperth, Tyler L. Hayes, et al. “Continual Learning: Applications and the Road Forward.” <i>Transactions on Machine Learning Research</i>. Transactions on Machine Learning Research, 2024."},"arxiv":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_created":"2025-03-16T23:01:25Z","author":[{"last_name":"Verwimp","first_name":"Eli","full_name":"Verwimp, Eli"},{"full_name":"Aljundi, Rahaf","last_name":"Aljundi","first_name":"Rahaf"},{"first_name":"Shai","last_name":"Ben-David","full_name":"Ben-David, Shai"},{"full_name":"Bethge, Matthias","first_name":"Matthias","last_name":"Bethge"},{"last_name":"Cossu","first_name":"Andrea","full_name":"Cossu, Andrea"},{"last_name":"Gepperth","first_name":"Alexander","full_name":"Gepperth, Alexander"},{"full_name":"Hayes, Tyler L.","last_name":"Hayes","first_name":"Tyler L."},{"last_name":"Hüllermeier","first_name":"Eyke","full_name":"Hüllermeier, Eyke"},{"first_name":"Christopher","last_name":"Kanan","full_name":"Kanan, Christopher"},{"first_name":"Dhireesha","last_name":"Kudithipudi","full_name":"Kudithipudi, Dhireesha"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","orcid":"0000-0001-8622-7887","last_name":"Lampert"},{"full_name":"Mundt, Martin","last_name":"Mundt","first_name":"Martin"},{"first_name":"Razvan","last_name":"Pascanu","full_name":"Pascanu, Razvan"},{"first_name":"Adrian","last_name":"Popescu","full_name":"Popescu, Adrian"},{"full_name":"Tolias, Andreas S.","last_name":"Tolias","first_name":"Andreas S."},{"full_name":"Van De Weijer, Joost","first_name":"Joost","last_name":"Van De 
Weijer"},{"full_name":"Liu, Bing","last_name":"Liu","first_name":"Bing"},{"first_name":"Vincenzo","last_name":"Lomonaco","full_name":"Lomonaco, Vincenzo"},{"full_name":"Tuytelaars, Tinne","last_name":"Tuytelaars","first_name":"Tinne"},{"last_name":"Van De Ven","first_name":"Gido M.","full_name":"Van De Ven, Gido M."}],"_id":"19408","scopus_import":"1","OA_place":"publisher","publication_identifier":{"eissn":["2835-8856"]},"volume":2024,"article_type":"original","alternative_title":["TMLR"],"intvolume":"      2024"},{"intvolume":"       238","publication_identifier":{"eissn":["2640-3498"]},"volume":238,"alternative_title":["PMLR"],"_id":"17093","scopus_import":"1","date_created":"2024-06-02T22:00:57Z","author":[{"id":"653bd8b6-f394-11eb-9cf6-c0bbf6cd78d4","full_name":"Zakerinia, Hossein","last_name":"Zakerinia","first_name":"Hossein"},{"full_name":"Talaei, Shayan","last_name":"Talaei","first_name":"Shayan"},{"orcid":"0000-0001-5634-0731","last_name":"Nadiradze","first_name":"Giorgi","id":"3279A00C-F248-11E8-B48F-1D18A9856A87","full_name":"Nadiradze, Giorgi"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","orcid":"0000-0003-3650-940X","last_name":"Alistarh","first_name":"Dan-Adrian"}],"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","oa":1,"date_published":"2024-05-01T00:00:00Z","arxiv":1,"citation":{"apa":"Zakerinia, H., Talaei, S., Nadiradze, G., &#38; Alistarh, D.-A. (2024). Communication-efficient federated learning with data and client heterogeneity. In <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i> (Vol. 238, pp. 3448–3456). Valencia, Spain: ML Research Press.","mla":"Zakerinia, Hossein, et al. “Communication-Efficient Federated Learning with Data and Client Heterogeneity.” <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, vol. 238, ML Research Press, 2024, pp. 3448–56.","short":"H. Zakerinia, S. Talaei, G. Nadiradze, D.-A. 
Alistarh, in:, Proceedings of the 27th International Conference on Artificial Intelligence and Statistics, ML Research Press, 2024, pp. 3448–3456.","chicago":"Zakerinia, Hossein, Shayan Talaei, Giorgi Nadiradze, and Dan-Adrian Alistarh. “Communication-Efficient Federated Learning with Data and Client Heterogeneity.” In <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, 238:3448–56. ML Research Press, 2024.","ama":"Zakerinia H, Talaei S, Nadiradze G, Alistarh D-A. Communication-efficient federated learning with data and client heterogeneity. In: <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>. Vol 238. ML Research Press; 2024:3448-3456.","ieee":"H. Zakerinia, S. Talaei, G. Nadiradze, and D.-A. Alistarh, “Communication-efficient federated learning with data and client heterogeneity,” in <i>Proceedings of the 27th International Conference on Artificial Intelligence and Statistics</i>, Valencia, Spain, 2024, vol. 238, pp. 3448–3456.","ista":"Zakerinia H, Talaei S, Nadiradze G, Alistarh D-A. 2024. Communication-efficient federated learning with data and client heterogeneity. Proceedings of the 27th International Conference on Artificial Intelligence and Statistics. AISTATS: Conference on Artificial Intelligence and Statistics, PMLR, vol. 238, 3448–3456."},"abstract":[{"text":"Federated Learning (FL) enables large-scale distributed training of machine learning models, while still allowing individual nodes to maintain data locally. However, executing FL at scale comes with inherent practical challenges: 1) heterogeneity of the local node data distributions, 2) heterogeneity of node computational speeds (asynchrony), but also 3) constraints in the amount of communication between the clients and the server. 
In this work, we present the first variant of the classic federated averaging (FedAvg) algorithm which, at the same time, supports data heterogeneity, partial client asynchrony, and communication compression. Our algorithm comes with a novel, rigorous analysis showing that, in spite of these system relaxations, it can provide similar convergence to FedAvg in interesting parameter regimes. Experimental results in the rigorous LEAF benchmark on setups of up to 300 nodes show that our algorithm ensures fast convergence for standard federated tasks, improving upon prior quantized and asynchronous approaches.","lang":"eng"}],"title":"Communication-efficient federated learning with data and client heterogeneity","oa_version":"Preprint","publication_status":"published","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2206.10032","open_access":"1"}],"page":"3448-3456","quality_controlled":"1","year":"2024","type":"conference","corr_author":"1","month":"05","conference":{"location":"Valencia, Spain","end_date":"2024-05-04","start_date":"2024-05-02","name":"AISTATS: Conference on Artificial Intelligence and Statistics"},"publisher":"ML Research Press","day":"01","language":[{"iso":"eng"}],"date_updated":"2024-10-09T21:08:57Z","status":"public","external_id":{"arxiv":["2206.10032"]},"publication":"Proceedings of the 27th International Conference on Artificial Intelligence and Statistics","department":[{"_id":"DaAl"},{"_id":"ChLa"}],"article_processing_charge":"No"},{"file_date_updated":"2024-08-12T07:38:06Z","quality_controlled":"1","acknowledged_ssus":[{"_id":"ScienComp"}],"abstract":[{"text":"We present PeFLL, a new personalized federated learning algorithm that improves\r\nover the state-of-the-art in three aspects: 1) it produces more accurate models,\r\nespecially in the low-data regime, and not only for clients present during its\r\ntraining phase, but also for any that may emerge in the future; 2) it reduces the\r\namount of on-client computation and 
client-server communication by providing\r\nfuture clients with ready-to-use personalized models that require no additional\r\nfinetuning or optimization; 3) it comes with theoretical guarantees that establish\r\ngeneralization from the observed clients to future ones.\r\nAt the core of PeFLL lies a learning-to-learn approach that jointly trains an\r\nembedding network and a hypernetwork. The embedding network is used to\r\nrepresent clients in a latent descriptor space in a way that reflects their similarity\r\nto each other. The hypernetwork takes as input such descriptors and outputs the\r\nparameters of fully personalized client models. In combination, both networks\r\nconstitute a learning algorithm that achieves state-of-the-art performance in several\r\npersonalized federated learning benchmarks","lang":"eng"}],"publication_status":"published","title":"PEFLL: Personalized federated learning by learning to learn","oa_version":"Published Version","author":[{"last_name":"Scott","first_name":"Jonathan A","id":"e499926b-f6e0-11ea-865d-9c63db0031e8","full_name":"Scott, Jonathan A"},{"orcid":"0009-0007-3977-6462","last_name":"Zakerinia","first_name":"Hossein","id":"653bd8b6-f394-11eb-9cf6-c0bbf6cd78d4","full_name":"Zakerinia, Hossein"},{"id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","full_name":"Lampert, Christoph","orcid":"0000-0001-8622-7887","last_name":"Lampert","first_name":"Christoph"}],"date_created":"2024-08-11T22:01:12Z","scopus_import":"1","_id":"17411","arxiv":1,"citation":{"ista":"Scott JA, Zakerinia H, Lampert C. 2024. PEFLL: Personalized federated learning by learning to learn. 12th International Conference on Learning Representations. ICLR: International Conference on Learning Representations.","ieee":"J. A. Scott, H. Zakerinia, and C. Lampert, “PEFLL: Personalized federated learning by learning to learn,” in <i>12th International Conference on Learning Representations</i>, Vienna, Austria, 2024.","ama":"Scott JA, Zakerinia H, Lampert C. 
PEFLL: Personalized federated learning by learning to learn. In: <i>12th International Conference on Learning Representations</i>. OpenReview; 2024.","short":"J.A. Scott, H. Zakerinia, C. Lampert, in:, 12th International Conference on Learning Representations, OpenReview, 2024.","chicago":"Scott, Jonathan A, Hossein Zakerinia, and Christoph Lampert. “PEFLL: Personalized Federated Learning by Learning to Learn.” In <i>12th International Conference on Learning Representations</i>. OpenReview, 2024.","mla":"Scott, Jonathan A., et al. “PEFLL: Personalized Federated Learning by Learning to Learn.” <i>12th International Conference on Learning Representations</i>, OpenReview, 2024.","apa":"Scott, J. A., Zakerinia, H., &#38; Lampert, C. (2024). PEFLL: Personalized federated learning by learning to learn. In <i>12th International Conference on Learning Representations</i>. Vienna, Austria: OpenReview."},"oa":1,"date_published":"2024-03-07T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","acknowledgement":"This research was supported by the Scientific Service Units (SSU) of ISTA through resources provided by Scientific Computing (SciComp).\r\n","external_id":{"arxiv":["2306.05515"]},"status":"public","article_processing_charge":"No","department":[{"_id":"ChLa"}],"has_accepted_license":"1","publication":"12th International Conference on Learning Representations","language":[{"iso":"eng"}],"related_material":{"record":[{"status":"public","id":"21198","relation":"dissertation_contains"}]},"file":[{"success":1,"content_type":"application/pdf","date_updated":"2024-08-12T07:38:06Z","file_size":1029219,"relation":"main_file","creator":"dernst","access_level":"open_access","checksum":"81b7ea2e667adaf9c7a7b6b376b1f251","file_name":"2024_ICLR_Scott.pdf","date_created":"2024-08-12T07:38:06Z","file_id":"17415"}],"ddc":["000"],"date_updated":"2026-04-07T11:46:11Z","publisher":"OpenReview","conference":{"end_date":"2024-03-07","start_date":"2024-03-07","name":"ICLR: 
International Conference on Learning Representations","location":"Vienna, Austria"},"month":"03","corr_author":"1","day":"07","type":"conference","year":"2024"},{"_id":"18120","scopus_import":"1","author":[{"full_name":"Scott, Jonathan A","id":"e499926b-f6e0-11ea-865d-9c63db0031e8","first_name":"Jonathan A","last_name":"Scott"},{"last_name":"Cahill","first_name":"Áine","full_name":"Cahill, Áine"}],"date_created":"2024-09-22T22:01:45Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_published":"2024-09-01T00:00:00Z","oa":1,"citation":{"short":"J.A. Scott, Á. Cahill, in:, Proceedings of the 41st International Conference on Machine Learning, ML Research Press, 2024, pp. 44012–44037.","chicago":"Scott, Jonathan A, and Áine Cahill. “Improved Modelling of Federated Datasets Using Mixtures-of-Dirichlet-Multinomials.” In <i>Proceedings of the 41st International Conference on Machine Learning</i>, 235:44012–37. ML Research Press, 2024.","mla":"Scott, Jonathan A., and Áine Cahill. “Improved Modelling of Federated Datasets Using Mixtures-of-Dirichlet-Multinomials.” <i>Proceedings of the 41st International Conference on Machine Learning</i>, vol. 235, ML Research Press, 2024, pp. 44012–37.","ama":"Scott JA, Cahill Á. Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials. In: <i>Proceedings of the 41st International Conference on Machine Learning</i>. Vol 235. ML Research Press; 2024:44012-44037.","apa":"Scott, J. A., &#38; Cahill, Á. (2024). Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials. In <i>Proceedings of the 41st International Conference on Machine Learning</i> (Vol. 235, pp. 44012–44037). Vienna, Austria: ML Research Press.","ista":"Scott JA, Cahill Á. 2024. Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials. Proceedings of the 41st International Conference on Machine Learning. ICML: International Conference on Machine Learning, PMLR, vol. 235, 44012–44037.","ieee":"J. A. 
Scott and Á. Cahill, “Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials,” in <i>Proceedings of the 41st International Conference on Machine Learning</i>, Vienna, Austria, 2024, vol. 235, pp. 44012–44037."},"arxiv":1,"intvolume":"       235","acknowledgement":"We would like to thank: Mona Chitnis and everyone in the Private Federated Learning team at Apple for their help and support throughout the entire project; Audra McMillan, Martin Pelikan, Anosh Raj and Barry Theobold for feedback on the initial versions of the paper; and Christoph Lampert for valuable feedback on the paper structure and suggestions for additional experiments.","volume":235,"publication_identifier":{"eissn":["2640-3498"]},"alternative_title":["PMLR"],"main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2406.02416","open_access":"1"}],"page":"44012-44037","quality_controlled":"1","abstract":[{"text":"In practice, training using federated learning can be orders of magnitude slower than standard centralized training. This severely limits the amount of experimentation and tuning that can be done, making it challenging to obtain good performance on a given task. Server-side proxy data can be used to run training simulations, for instance for hyperparameter tuning. This can greatly speed up the training pipeline by reducing the number of tuning runs to be performed overall on the true clients. However, it is challenging to ensure that these simulations accurately reflect the dynamics of the real federated training. In particular, the proxy data used for simulations often comes as a single centralized dataset without a partition into distinct clients, and partitioning this data in a naive way can lead to simulations that poorly reflect real federated training. In this paper we address the challenge of how to partition centralized data in a way that reflects the statistical heterogeneity of the true federated clients. 
We propose a fully federated, theoretically justified, algorithm that efficiently learns the distribution of the true clients and observe improved server-side simulations when using the inferred distribution to create simulated clients from the centralized data.","lang":"eng"}],"title":"Improved modelling of federated datasets using mixtures-of-Dirichlet-multinomials","oa_version":"Preprint","publication_status":"published","corr_author":"1","month":"09","conference":{"end_date":"2024-07-27","start_date":"2024-07-21","name":"ICML: International Conference on Machine Learning","location":"Vienna, Austria"},"publisher":"ML Research Press","day":"01","year":"2024","type":"conference","status":"public","external_id":{"arxiv":["2406.02416"]},"publication":"Proceedings of the 41st International Conference on Machine Learning","department":[{"_id":"ChLa"}],"article_processing_charge":"No","related_material":{"record":[{"relation":"dissertation_contains","id":"21198","status":"public"}]},"language":[{"iso":"eng"}],"date_updated":"2026-04-07T11:46:11Z"},{"month":"06","corr_author":"1","publisher":"Computer Vision Foundation","conference":{"end_date":"2024-06-22","name":"CVPR: Conference on Computer Vision and Pattern Recognition","start_date":"2024-06-16","location":"Seattle, WA, United States"},"OA_type":"green","day":"01","year":"2024","type":"conference","status":"public","external_id":{"isi":["001344387500055"],"arxiv":["2311.16833"]},"isi":1,"department":[{"_id":"GradSch"},{"_id":"ChLa"}],"has_accepted_license":"1","publication":"Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition","article_processing_charge":"No","language":[{"iso":"eng"}],"related_material":{"record":[{"id":"19759","status":"public","relation":"dissertation_contains"}],"link":[{"url":"https://github.com/berndprach/1LipschitzLayersCompared","relation":"software"}]},"date_updated":"2026-04-07T11:49:51Z","OA_place":"repository","_id":"17426","author":[{"full_name":"Prach, 
Bernd","id":"2D561D42-C427-11E9-89B4-9C1AE6697425","first_name":"Bernd","last_name":"Prach"},{"first_name":"Fabio","last_name":"Brau","full_name":"Brau, Fabio"},{"full_name":"Buttazzo, Giorgio","first_name":"Giorgio","last_name":"Buttazzo"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887"}],"date_created":"2024-08-14T08:42:32Z","doi":"10.1109/CVPR52733.2024.02320","user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","citation":{"chicago":"Prach, Bernd, Fabio Brau, Giorgio Buttazzo, and Christoph Lampert. “1-Lipschitz Layers Compared: Memory, Speed, and Certifiable Robustness.” In <i>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, 24574–83. Computer Vision Foundation, 2024. <a href=\"https://doi.org/10.1109/CVPR52733.2024.02320\">https://doi.org/10.1109/CVPR52733.2024.02320</a>.","short":"B. Prach, F. Brau, G. Buttazzo, C. Lampert, in:, Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, Computer Vision Foundation, 2024, pp. 24574–24583.","mla":"Prach, Bernd, et al. “1-Lipschitz Layers Compared: Memory, Speed, and Certifiable Robustness.” <i>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, Computer Vision Foundation, 2024, pp. 24574–83, doi:<a href=\"https://doi.org/10.1109/CVPR52733.2024.02320\">10.1109/CVPR52733.2024.02320</a>.","ama":"Prach B, Brau F, Buttazzo G, Lampert C. 1-Lipschitz layers compared: Memory, speed, and certifiable robustness. In: <i>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>. Computer Vision Foundation; 2024:24574-24583. doi:<a href=\"https://doi.org/10.1109/CVPR52733.2024.02320\">10.1109/CVPR52733.2024.02320</a>","apa":"Prach, B., Brau, F., Buttazzo, G., &#38; Lampert, C. (2024). 1-Lipschitz layers compared: Memory, speed, and certifiable robustness. 
In <i>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</i> (pp. 24574–24583). Seattle, WA, United States: Computer Vision Foundation. <a href=\"https://doi.org/10.1109/CVPR52733.2024.02320\">https://doi.org/10.1109/CVPR52733.2024.02320</a>","ista":"Prach B, Brau F, Buttazzo G, Lampert C. 2024. 1-Lipschitz layers compared: Memory, speed, and certifiable robustness. Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. CVPR: Conference on Computer Vision and Pattern Recognition, 24574–24583.","ieee":"B. Prach, F. Brau, G. Buttazzo, and C. Lampert, “1-Lipschitz layers compared: Memory, speed, and certifiable robustness,” in <i>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, Seattle, WA, United States, 2024, pp. 24574–24583."},"arxiv":1,"oa":1,"date_published":"2024-06-01T00:00:00Z","acknowledgement":"This work was partially supported by project SERICS (PE00000014) under the MUR National Recovery and Resilience Plan funded by the European Union - NextGenerationEU.\r\n","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2311.16833"}],"page":"24574-24583","quality_controlled":"1","abstract":[{"lang":"eng","text":"The robustness of neural networks against input perturbations with bounded\r\nmagnitude represents a serious concern in the deployment of deep learning\r\nmodels in safety-critical systems. Recently, the scientific community has\r\nfocused on enhancing certifiable robustness guarantees by crafting 1-Lipschitz\r\nneural networks that leverage Lipschitz bounded dense and convolutional layers.\r\nAlthough different methods have been proposed in the literature to achieve this\r\ngoal, understanding the performance of such methods is not straightforward,\r\nsince different metrics can be relevant (e.g., training time, memory usage,\r\naccuracy, certifiable robustness) for different applications. 
For this reason,\r\nthis work provides a thorough theoretical and empirical comparison between\r\nmethods by evaluating them in terms of memory usage, speed, and certifiable\r\nrobust accuracy. The paper also provides some guidelines and recommendations to\r\nsupport the user in selecting the methods that work best depending on the\r\navailable resources. We provide code at\r\nhttps://github.com/berndprach/1LipschitzLayersCompared."}],"title":"1-Lipschitz layers compared: Memory, speed, and certifiable robustness","oa_version":"Preprint","publication_status":"published"},{"date_published":"2024-12-05T00:00:00Z","oa":1,"citation":{"apa":"Prach, B., &#38; Lampert, C. (n.d.). Intriguing properties of robust classification. <i>arXiv</i>. <a href=\"https://doi.org/10.48550/arXiv.2412.04245\">https://doi.org/10.48550/arXiv.2412.04245</a>","ama":"Prach B, Lampert C. Intriguing properties of robust classification. <i>arXiv</i>. doi:<a href=\"https://doi.org/10.48550/arXiv.2412.04245\">10.48550/arXiv.2412.04245</a>","chicago":"Prach, Bernd, and Christoph Lampert. “Intriguing Properties of Robust Classification.” <i>ArXiv</i>, n.d. <a href=\"https://doi.org/10.48550/arXiv.2412.04245\">https://doi.org/10.48550/arXiv.2412.04245</a>.","mla":"Prach, Bernd, and Christoph Lampert. “Intriguing Properties of Robust Classification.” <i>ArXiv</i>, 2412.04245, doi:<a href=\"https://doi.org/10.48550/arXiv.2412.04245\">10.48550/arXiv.2412.04245</a>.","short":"B. Prach, C. Lampert, ArXiv (n.d.).","ieee":"B. Prach and C. Lampert, “Intriguing properties of robust classification,” <i>arXiv</i>. .","ista":"Prach B, Lampert C. Intriguing properties of robust classification. 
arXiv, 2412.04245."},"arxiv":1,"day":"05","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","doi":"10.48550/arXiv.2412.04245","author":[{"id":"2D561D42-C427-11E9-89B4-9C1AE6697425","full_name":"Prach, Bernd","last_name":"Prach","first_name":"Bernd"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887"}],"date_created":"2025-01-24T16:57:29Z","_id":"18874","corr_author":"1","OA_place":"repository","month":"12","type":"preprint","year":"2024","article_processing_charge":"No","publication":"arXiv","department":[{"_id":"GradSch"},{"_id":"ChLa"}],"external_id":{"arxiv":["2412.04245"]},"article_number":"2412.04245","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2412.04245","open_access":"1"}],"status":"public","publication_status":"draft","date_updated":"2026-04-07T11:49:51Z","oa_version":"Preprint","title":"Intriguing properties of robust classification","related_material":{"record":[{"status":"public","id":"20455","relation":"later_version"},{"status":"public","id":"19759","relation":"dissertation_contains"}]},"language":[{"iso":"eng"}],"abstract":[{"text":"Despite extensive research since the community learned about adversarial\r\nexamples 10 years ago, we still do not know how to train high-accuracy\r\nclassifiers that are guaranteed to be robust to small perturbations of their\r\ninputs. Previous works often argued that this might be because no classifier\r\nexists that is robust and accurate at the same time. However, in computer\r\nvision this assumption does not match reality where humans are usually accurate\r\nand robust on most tasks of interest. We offer an alternative explanation and\r\nshow that in certain settings robust generalization is only possible with\r\nunrealistically large amounts of data. 
More precisely we find a setting where a\r\nrobust classifier exists, it is easy to learn an accurate classifier, yet it\r\nrequires an exponential amount of data to learn a robust classifier. Based on\r\nthis theoretical result, we explore how well robust classifiers generalize on\r\ndatasets such as CIFAR-10. We come to the conclusion that on this datasets, the\r\nlimitation of current robust models also lies in the generalization, and that\r\nthey require a lot of data to do well on the test set. We also show that the\r\nproblem is not in the expressiveness or generalization capabilities of current\r\narchitectures, and that there are low magnitude features in the data which are\r\nuseful for non-robust generalization but are not available for robust\r\nclassifiers.","lang":"eng"}]},{"article_type":"original","publication_identifier":{"issn":["2469-9950"],"eissn":["2469-9969"]},"volume":108,"acknowledgement":"A.F.Y. acknowledges primary support from the Department of Energy under award DE-SC0020043, and additional support from the Gordon and Betty Moore Foundation under award GBMF9471 for group operations.","intvolume":"       108","arxiv":1,"citation":{"chicago":"Henderson, Paul M, Areg Ghazaryan, Alexander A. Zibrov, Andrea F. Young, and Maksym Serbyn. “Deep Learning Extraction of Band Structure Parameters from Density of States: A Case Study on Trilayer Graphene.” <i>Physical Review B</i>. American Physical Society, 2023. <a href=\"https://doi.org/10.1103/physrevb.108.125411\">https://doi.org/10.1103/physrevb.108.125411</a>.","short":"P.M. Henderson, A. Ghazaryan, A.A. Zibrov, A.F. Young, M. Serbyn, Physical Review B 108 (2023).","mla":"Henderson, Paul M., et al. “Deep Learning Extraction of Band Structure Parameters from Density of States: A Case Study on Trilayer Graphene.” <i>Physical Review B</i>, vol. 108, no. 
12, 125411, American Physical Society, 2023, doi:<a href=\"https://doi.org/10.1103/physrevb.108.125411\">10.1103/physrevb.108.125411</a>.","ama":"Henderson PM, Ghazaryan A, Zibrov AA, Young AF, Serbyn M. Deep learning extraction of band structure parameters from density of states: A case study on trilayer graphene. <i>Physical Review B</i>. 2023;108(12). doi:<a href=\"https://doi.org/10.1103/physrevb.108.125411\">10.1103/physrevb.108.125411</a>","apa":"Henderson, P. M., Ghazaryan, A., Zibrov, A. A., Young, A. F., &#38; Serbyn, M. (2023). Deep learning extraction of band structure parameters from density of states: A case study on trilayer graphene. <i>Physical Review B</i>. American Physical Society. <a href=\"https://doi.org/10.1103/physrevb.108.125411\">https://doi.org/10.1103/physrevb.108.125411</a>","ista":"Henderson PM, Ghazaryan A, Zibrov AA, Young AF, Serbyn M. 2023. Deep learning extraction of band structure parameters from density of states: A case study on trilayer graphene. Physical Review B. 108(12), 125411.","ieee":"P. M. Henderson, A. Ghazaryan, A. A. Zibrov, A. F. Young, and M. Serbyn, “Deep learning extraction of band structure parameters from density of states: A case study on trilayer graphene,” <i>Physical Review B</i>, vol. 108, no. 12. 
American Physical Society, 2023."},"oa":1,"date_published":"2023-09-15T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","date_created":"2023-09-12T07:12:12Z","doi":"10.1103/physrevb.108.125411","author":[{"last_name":"Henderson","orcid":"0000-0002-5198-7445","first_name":"Paul M","id":"13C09E74-18D9-11E9-8878-32CFE5697425","full_name":"Henderson, Paul M"},{"full_name":"Ghazaryan, Areg","id":"4AF46FD6-F248-11E8-B48F-1D18A9856A87","first_name":"Areg","orcid":"0000-0001-9666-3543","last_name":"Ghazaryan"},{"full_name":"Zibrov, Alexander A.","first_name":"Alexander A.","last_name":"Zibrov"},{"full_name":"Young, Andrea F.","last_name":"Young","first_name":"Andrea F."},{"id":"47809E7E-F248-11E8-B48F-1D18A9856A87","full_name":"Serbyn, Maksym","last_name":"Serbyn","orcid":"0000-0002-2399-5827","first_name":"Maksym"}],"scopus_import":"1","_id":"14320","publication_status":"published","oa_version":"Preprint","title":"Deep learning extraction of band structure parameters from density of states: A case study on trilayer graphene","abstract":[{"lang":"eng","text":"The development of two-dimensional materials has resulted in a diverse range of novel, high-quality compounds with increasing complexity. A key requirement for a comprehensive quantitative theory is the accurate determination of these materials' band structure parameters. However, this task is challenging due to the intricate band structures and the indirect nature of experimental probes. In this work, we introduce a general framework to derive band structure parameters from experimental data using deep neural networks. We applied our method to the penetration field capacitance measurement of trilayer graphene, an effective probe of its density of states. First, we demonstrate that a trained deep network gives accurate predictions for the penetration field capacitance as a function of tight-binding parameters. 
Next, we use the fast and accurate predictions from the trained network to automatically determine tight-binding parameters directly from experimental data, with extracted parameters being in a good agreement with values in the literature. We conclude by discussing potential applications of our method to other materials and experimental techniques beyond penetration field capacitance."}],"quality_controlled":"1","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2210.06310","open_access":"1"}],"type":"journal_article","year":"2023","day":"15","issue":"12","publisher":"American Physical Society","month":"09","date_updated":"2023-09-20T09:38:24Z","language":[{"iso":"eng"}],"article_processing_charge":"No","department":[{"_id":"MaSe"},{"_id":"ChLa"},{"_id":"MiLe"}],"publication":"Physical Review B","article_number":"125411","external_id":{"arxiv":["2210.06310"]},"status":"public"},{"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","day":"20","citation":{"ieee":"P. Tomaszewska and C. Lampert, “On the implementation of baselines and lightweight conditional model extrapolation (LIMES) under class-prior shift,” in <i>International Workshop on Reproducible Research in Pattern Recognition</i>, Montreal, Canada, 2023, vol. 14068, pp. 67–73.","ista":"Tomaszewska P, Lampert C. 2023. On the implementation of baselines and lightweight conditional model extrapolation (LIMES) under class-prior shift. International Workshop on Reproducible Research in Pattern Recognition. RRPR: Reproducible Research in Pattern Recognition, LNCS, vol. 14068, 67–73.","apa":"Tomaszewska, P., &#38; Lampert, C. (2023). On the implementation of baselines and lightweight conditional model extrapolation (LIMES) under class-prior shift. In <i>International Workshop on Reproducible Research in Pattern Recognition</i> (Vol. 14068, pp. 67–73). Montreal, Canada: Springer Nature. <a href=\"https://doi.org/10.1007/978-3-031-40773-4_6\">https://doi.org/10.1007/978-3-031-40773-4_6</a>","short":"P. 
Tomaszewska, C. Lampert, in:, International Workshop on Reproducible Research in Pattern Recognition, Springer Nature, 2023, pp. 67–73.","mla":"Tomaszewska, Paulina, and Christoph Lampert. “On the Implementation of Baselines and Lightweight Conditional Model Extrapolation (LIMES) under Class-Prior Shift.” <i>International Workshop on Reproducible Research in Pattern Recognition</i>, vol. 14068, Springer Nature, 2023, pp. 67–73, doi:<a href=\"https://doi.org/10.1007/978-3-031-40773-4_6\">10.1007/978-3-031-40773-4_6</a>.","chicago":"Tomaszewska, Paulina, and Christoph Lampert. “On the Implementation of Baselines and Lightweight Conditional Model Extrapolation (LIMES) under Class-Prior Shift.” In <i>International Workshop on Reproducible Research in Pattern Recognition</i>, 14068:67–73. Springer Nature, 2023. <a href=\"https://doi.org/10.1007/978-3-031-40773-4_6\">https://doi.org/10.1007/978-3-031-40773-4_6</a>.","ama":"Tomaszewska P, Lampert C. On the implementation of baselines and lightweight conditional model extrapolation (LIMES) under class-prior shift. In: <i>International Workshop on Reproducible Research in Pattern Recognition</i>. Vol 14068. Springer Nature; 2023:67-73. 
doi:<a href=\"https://doi.org/10.1007/978-3-031-40773-4_6\">10.1007/978-3-031-40773-4_6</a>"},"date_published":"2023-08-20T00:00:00Z","month":"08","scopus_import":"1","_id":"14410","publisher":"Springer Nature","date_created":"2023-10-08T22:01:18Z","doi":"10.1007/978-3-031-40773-4_6","conference":{"start_date":"2022-08-21","name":"RRPR: Reproducible Research in Pattern Recognition","end_date":"2022-08-21","location":"Montreal, Canada"},"author":[{"last_name":"Tomaszewska","first_name":"Paulina","full_name":"Tomaszewska, Paulina"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","orcid":"0000-0001-8622-7887","last_name":"Lampert"}],"year":"2023","alternative_title":["LNCS"],"publication_identifier":{"isbn":["9783031407727"],"issn":["0302-9743"],"eissn":["1611-3349"]},"volume":14068,"type":"conference","intvolume":"     14068","quality_controlled":"1","department":[{"_id":"ChLa"}],"publication":"International Workshop on Reproducible Research in Pattern Recognition","article_processing_charge":"No","status":"public","page":"67-73","title":"On the implementation of baselines and lightweight conditional model extrapolation (LIMES) under class-prior shift","oa_version":"None","date_updated":"2023-10-09T06:48:02Z","publication_status":"published","abstract":[{"lang":"eng","text":"This paper focuses on the implementation details of the baseline methods and a recent lightweight conditional model extrapolation algorithm LIMES [5] for streaming data under class-prior shift. LIMES achieves superior performance over the baseline methods, especially concerning the minimum-across-day accuracy, which is important for the users of the system. 
In this work, the key measures to facilitate reproducibility and enhance the credibility of the results are described."}],"language":[{"iso":"eng"}]},{"year":"2023","type":"journal_article","tmp":{"legal_code_url":"https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode","image":"/images/cc_by_nc_nd.png","name":"Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)","short":"CC BY-NC-ND (4.0)"},"day":"01","issue":"4","license":"https://creativecommons.org/licenses/by-nc-nd/4.0/","month":"08","publisher":"Sciendo","date_updated":"2025-09-09T13:10:30Z","ddc":["510"],"file":[{"date_created":"2023-10-31T12:07:23Z","file_id":"14476","checksum":"b069cc10fa6a7c96b2bc9f728165f9e6","file_name":"2023_MeasurementScienceRev_Jakubik.pdf","access_level":"open_access","creator":"dernst","relation":"main_file","date_updated":"2023-10-31T12:07:23Z","file_size":2639783,"content_type":"application/pdf","success":1}],"language":[{"iso":"eng"}],"publication":"Measurement Science Review","isi":1,"department":[{"_id":"ChLa"}],"has_accepted_license":"1","article_processing_charge":"Yes","status":"public","external_id":{"isi":["001070829600005"]},"volume":23,"publication_identifier":{"eissn":["1335-8871"]},"article_type":"original","intvolume":"        23","acknowledgement":"The work was supported by the Scientific Grant Agency of the Ministry of Education of the Slovak Republic and the Slovak Academy of Sciences, projects APVV-21-0216, VEGA2-0096-21 and VEGA 2-0023-22.","user_id":"317138e5-6ab7-11ef-aa6d-ffef3953e345","oa":1,"date_published":"2023-08-01T00:00:00Z","citation":{"apa":"Jakubík, J., Phuong, M., Chvosteková, M., &#38; Krakovská, A. (2023). Against the flow of time with multi-output models. <i>Measurement Science Review</i>. Sciendo. <a href=\"https://doi.org/10.2478/msr-2023-0023\">https://doi.org/10.2478/msr-2023-0023</a>","short":"J. Jakubík, M. Phuong, M. Chvosteková, A. 
Krakovská, Measurement Science Review 23 (2023) 175–183.","mla":"Jakubík, Jozef, et al. “Against the Flow of Time with Multi-Output Models.” <i>Measurement Science Review</i>, vol. 23, no. 4, Sciendo, 2023, pp. 175–83, doi:<a href=\"https://doi.org/10.2478/msr-2023-0023\">10.2478/msr-2023-0023</a>.","chicago":"Jakubík, Jozef, Mary Phuong, Martina Chvosteková, and Anna Krakovská. “Against the Flow of Time with Multi-Output Models.” <i>Measurement Science Review</i>. Sciendo, 2023. <a href=\"https://doi.org/10.2478/msr-2023-0023\">https://doi.org/10.2478/msr-2023-0023</a>.","ama":"Jakubík J, Phuong M, Chvosteková M, Krakovská A. Against the flow of time with multi-output models. <i>Measurement Science Review</i>. 2023;23(4):175-183. doi:<a href=\"https://doi.org/10.2478/msr-2023-0023\">10.2478/msr-2023-0023</a>","ieee":"J. Jakubík, M. Phuong, M. Chvosteková, and A. Krakovská, “Against the flow of time with multi-output models,” <i>Measurement Science Review</i>, vol. 23, no. 4. Sciendo, pp. 175–183, 2023.","ista":"Jakubík J, Phuong M, Chvosteková M, Krakovská A. 2023. Against the flow of time with multi-output models. Measurement Science Review. 23(4), 175–183."},"_id":"14446","scopus_import":"1","date_created":"2023-10-22T22:01:15Z","doi":"10.2478/msr-2023-0023","author":[{"last_name":"Jakubík","first_name":"Jozef","full_name":"Jakubík, Jozef"},{"id":"3EC6EE64-F248-11E8-B48F-1D18A9856A87","full_name":"Bui Thi Mai, Phuong","last_name":"Bui Thi Mai","first_name":"Phuong"},{"full_name":"Chvosteková, Martina","first_name":"Martina","last_name":"Chvosteková"},{"full_name":"Krakovská, Anna","first_name":"Anna","last_name":"Krakovská"}],"title":"Against the flow of time with multi-output models","oa_version":"Published Version","publication_status":"published","abstract":[{"lang":"eng","text":"Recent work has paid close attention to the first principle of Granger causality, according to which cause precedes effect. 
In this context, the question may arise whether the detected direction of causality also reverses after the time reversal of unidirectionally coupled data. Recently, it has been shown that for unidirectionally causally connected autoregressive (AR) processes X → Y, after time reversal of data, the opposite causal direction Y → X is indeed detected, although typically as part of the bidirectional X ↔ Y link. As we argue here, the answer is different when the measured data are not from AR processes but from linked deterministic systems. When the goal is the usual forward data analysis, cross-mapping-like approaches correctly detect X → Y, while Granger causality-like approaches, which should not be used for deterministic time series, detect causal independence X → Y. The results of backward causal analysis depend on the predictability of the reversed data. Unlike AR processes, observables from deterministic dynamical systems, even complex nonlinear ones, can be predicted well forward, while backward predictions can be difficult (notably when the time reversal of a function leads to one-to-many relations). To address this problem, we propose an approach based on models that provide multiple candidate predictions for the target, combined with a loss function that considers only the best candidate. The resulting good forward and backward predictability supports the view that unidirectionally causally linked deterministic dynamical systems X → Y can be expected to detect the same link both before and after time reversal."}],"file_date_updated":"2023-10-31T12:07:23Z","quality_controlled":"1","page":"175-183"},{"abstract":[{"text":"We present Cross-Client Label Propagation (XCLP), a new method for transductive federated learning. XCLP estimates a data graph jointly from the data of multiple clients and computes labels for the unlabeled data by propagating label information across the graph. 
To avoid clients having to share their data with anyone, XCLP employs two cryptographically secure protocols: secure Hamming distance computation and secure summation. We demonstrate two distinct applications of XCLP within federated learning. In the first, we use it in a one-shot way to predict labels for unseen test points. In the second, we use it to repeatedly pseudo-label unlabeled training data in a federated semi-supervised setting. Experiments on both real federated and standard benchmark datasets show that in both applications XCLP achieves higher classification accuracy than alternative approaches.","lang":"eng"}],"publication_status":"published","title":"Cross-client label propagation for transductive and semi-supervised federated learning","oa_version":"Preprint","file_date_updated":"2025-02-04T08:30:05Z","quality_controlled":"1","alternative_title":["TMLR"],"publication_identifier":{"issn":["2835-8856"]},"author":[{"first_name":"Jonathan A","last_name":"Scott","full_name":"Scott, Jonathan A","id":"e499926b-f6e0-11ea-865d-9c63db0031e8"},{"orcid":"0009-0001-3676-4809","last_name":"Yeo","first_name":"Michelle X","id":"2D82B818-F248-11E8-B48F-1D18A9856A87","full_name":"Yeo, Michelle X"},{"first_name":"Christoph","orcid":"0000-0001-8622-7887","last_name":"Lampert","full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"}],"date_created":"2023-02-20T08:21:50Z","OA_place":"repository","_id":"12660","citation":{"ieee":"J. A. Scott, M. X. Yeo, and C. Lampert, “Cross-client label propagation for transductive and semi-supervised federated learning,” in <i>Transactions in Machine Learning</i>, 2023.","ista":"Scott JA, Yeo MX, Lampert C. 2023. Cross-client label propagation for transductive and semi-supervised federated learning. Transactions in Machine Learning. , TMLR, .","apa":"Scott, J. A., Yeo, M. X., &#38; Lampert, C. (2023). Cross-client label propagation for transductive and semi-supervised federated learning. 
In <i>Transactions in Machine Learning</i>. Curran Associates.","mla":"Scott, Jonathan A., et al. “Cross-Client Label Propagation for Transductive and Semi-Supervised Federated Learning.” <i>Transactions in Machine Learning</i>, Curran Associates, 2023.","short":"J.A. Scott, M.X. Yeo, C. Lampert, in:, Transactions in Machine Learning, Curran Associates, 2023.","chicago":"Scott, Jonathan A, Michelle X Yeo, and Christoph Lampert. “Cross-Client Label Propagation for Transductive and Semi-Supervised Federated Learning.” In <i>Transactions in Machine Learning</i>. Curran Associates, 2023.","ama":"Scott JA, Yeo MX, Lampert C. Cross-client label propagation for transductive and semi-supervised federated learning. In: <i>Transactions in Machine Learning</i>. Curran Associates; 2023."},"arxiv":1,"date_published":"2023-11-27T00:00:00Z","oa":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","language":[{"iso":"eng"}],"related_material":{"link":[{"url":"https://github.com/jonnyascott/xclp","relation":"software"}]},"date_updated":"2025-02-04T08:32:19Z","ddc":["004"],"file":[{"relation":"main_file","creator":"dernst","access_level":"open_access","checksum":"aa322ad91cbd229f5cafe6733a119bd1","file_name":"2023_TMLR_Scott.pdf","file_id":"18990","date_created":"2025-02-04T08:30:05Z","success":1,"content_type":"application/pdf","file_size":553717,"date_updated":"2025-02-04T08:30:05Z"}],"external_id":{"arxiv":["2210.06434"]},"status":"public","article_processing_charge":"No","has_accepted_license":"1","department":[{"_id":"ChLa"}],"publication":"Transactions in Machine Learning","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png"},"type":"conference","year":"2023","publisher":"Curran 
Associates","month":"11","corr_author":"1","OA_type":"green","day":"27"},{"external_id":{"arxiv":["2305.13165"]},"status":"public","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2305.13165"}],"article_processing_charge":"No","quality_controlled":"1","department":[{"_id":"MaMo"},{"_id":"ChLa"}],"publication":"37th Annual Conference on Neural Information Processing Systems","language":[{"iso":"eng"}],"abstract":[{"text":"Neural collapse (NC) refers to the surprising structure of the last layer of deep neural networks in the terminal phase of gradient descent training. Recently, an increasing amount of experimental evidence has pointed to the propagation of NC to earlier layers of neural networks. However, while the NC in the last layer is well studied theoretically, much less is known about its multi-layered counterpart - deep neural collapse (DNC). In particular, existing work focuses either on linear layers or only on the last two layers at the price of an extra assumption. Our paper fills this gap by generalizing the established analytical framework for NC - the unconstrained features model - to multiple non-linear layers. Our key technical contribution is to show that, in a deep unconstrained features model, the unique global optimum for binary classification exhibits all the properties typical of DNC. This explains the existing experimental evidence of DNC. 
We also empirically show that (i) by optimizing deep unconstrained features models via gradient descent, the resulting solution agrees well with our theory, and (ii) trained networks recover the unconstrained features suitable for the occurrence of DNC, thus supporting the validity of this modeling principle.","lang":"eng"}],"publication_status":"published","date_updated":"2025-04-15T07:50:16Z","title":"Deep neural collapse is provably optimal for the deep unconstrained features model","oa_version":"Preprint","author":[{"last_name":"Súkeník","first_name":"Peter","id":"d64d6a8d-eb8e-11eb-b029-96fd216dec3c","full_name":"Súkeník, Peter"},{"id":"27EB676C-8706-11E9-9510-7717E6697425","full_name":"Mondelli, Marco","orcid":"0000-0002-3242-7020","last_name":"Mondelli","first_name":"Marco"},{"full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87","first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887"}],"date_created":"2024-02-02T11:17:41Z","conference":{"end_date":"2023-12-16","start_date":"2023-12-10","name":"NeurIPS: Neural Information Processing Systems","location":"New Orleans, LA, United States"},"month":"12","_id":"14921","corr_author":"1","citation":{"mla":"Súkeník, Peter, et al. “Deep Neural Collapse Is Provably Optimal for the Deep Unconstrained Features Model.” <i>37th Annual Conference on Neural Information Processing Systems</i>, 2023.","chicago":"Súkeník, Peter, Marco Mondelli, and Christoph Lampert. “Deep Neural Collapse Is Provably Optimal for the Deep Unconstrained Features Model.” In <i>37th Annual Conference on Neural Information Processing Systems</i>, 2023.","short":"P. Súkeník, M. Mondelli, C. Lampert, in:, 37th Annual Conference on Neural Information Processing Systems, 2023.","ama":"Súkeník P, Mondelli M, Lampert C. Deep neural collapse is provably optimal for the deep unconstrained features model. In: <i>37th Annual Conference on Neural Information Processing Systems</i>. 
; 2023.","apa":"Súkeník, P., Mondelli, M., &#38; Lampert, C. (2023). Deep neural collapse is provably optimal for the deep unconstrained features model. In <i>37th Annual Conference on Neural Information Processing Systems</i>. New Orleans, LA, United States.","ista":"Súkeník P, Mondelli M, Lampert C. 2023. Deep neural collapse is provably optimal for the deep unconstrained features model. 37th Annual Conference on Neural Information Processing Systems. NeurIPS: Neural Information Processing Systems, NeurIPS, .","ieee":"P. Súkeník, M. Mondelli, and C. Lampert, “Deep neural collapse is provably optimal for the deep unconstrained features model,” in <i>37th Annual Conference on Neural Information Processing Systems</i>, New Orleans, LA, United States, 2023."},"arxiv":1,"date_published":"2023-12-15T00:00:00Z","oa":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","day":"15","acknowledgement":"M. M. is partially supported by the 2019 Lopez-Loreta Prize. The authors would like to thank Eugenia Iofinova, Bernd Prach and Simone Bombari for valuable feedback on the manuscript.","project":[{"_id":"059876FA-7A3F-11EA-A408-12923DDC885E","name":"Prix Lopez-Loretta 2019 - Marco Mondelli"}],"type":"conference","year":"2023","alternative_title":["NeurIPS"]},{"corr_author":"1","_id":"15039","month":"11","OA_place":"repository","author":[{"full_name":"Prach, Bernd","id":"2D561D42-C427-11E9-89B4-9C1AE6697425","first_name":"Bernd","last_name":"Prach"},{"first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887","full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"}],"doi":"10.48550/ARXIV.2311.06103","date_created":"2024-02-28T17:59:32Z","day":"10","user_id":"8b945eb4-e2f2-11eb-945a-df72226e66a9","date_published":"2023-11-10T00:00:00Z","oa":1,"arxiv":1,"citation":{"mla":"Prach, Bernd, and Christoph Lampert. 
“1-Lipschitz Neural Networks Are More Expressive with N-Activations.” <i>ArXiv</i>, 2311.06103, doi:<a href=\"https://doi.org/10.48550/ARXIV.2311.06103\">10.48550/ARXIV.2311.06103</a>.","chicago":"Prach, Bernd, and Christoph Lampert. “1-Lipschitz Neural Networks Are More Expressive with N-Activations.” <i>ArXiv</i>, n.d. <a href=\"https://doi.org/10.48550/ARXIV.2311.06103\">https://doi.org/10.48550/ARXIV.2311.06103</a>.","short":"B. Prach, C. Lampert, ArXiv (n.d.).","ama":"Prach B, Lampert C. 1-Lipschitz neural networks are more expressive with N-activations. <i>arXiv</i>. doi:<a href=\"https://doi.org/10.48550/ARXIV.2311.06103\">10.48550/ARXIV.2311.06103</a>","apa":"Prach, B., &#38; Lampert, C. (n.d.). 1-Lipschitz neural networks are more expressive with N-activations. <i>arXiv</i>. <a href=\"https://doi.org/10.48550/ARXIV.2311.06103\">https://doi.org/10.48550/ARXIV.2311.06103</a>","ista":"Prach B, Lampert C. 1-Lipschitz neural networks are more expressive with N-activations. arXiv, 2311.06103.","ieee":"B. Prach and C. Lampert, “1-Lipschitz neural networks are more expressive with N-activations,” <i>arXiv</i>. ."},"tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","short":"CC BY (4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png"},"year":"2023","type":"preprint","main_file_link":[{"url":"https://doi.org/10.48550/arXiv.2311.06103","open_access":"1"}],"status":"public","external_id":{"arxiv":["2311.06103"]},"article_number":"2311.06103","publication":"arXiv","department":[{"_id":"GradSch"},{"_id":"ChLa"}],"article_processing_charge":"No","abstract":[{"lang":"eng","text":"A crucial property for achieving secure, trustworthy and interpretable deep learning systems is their robustness: small changes to a system's inputs should not result in large changes to its outputs. Mathematically, this means one strives for networks with a small Lipschitz constant. 
Several recent works have focused on how to construct such Lipschitz networks, typically by imposing constraints on the weight matrices. In this work, we study an orthogonal aspect, namely the role of the activation function. We show that commonly used activation functions, such as MaxMin, as well as all piece-wise linear ones with two segments unnecessarily restrict the class of representable functions, even in the simplest one-dimensional setting. We furthermore introduce the new N-activation function that is provably more expressive than currently popular activation functions. We provide code at this https URL."}],"related_material":{"record":[{"relation":"dissertation_contains","status":"public","id":"19759"}]},"language":[{"iso":"eng"}],"oa_version":"Preprint","title":"1-Lipschitz neural networks are more expressive with N-activations","date_updated":"2026-04-07T11:49:51Z","publication_status":"draft"},{"author":[{"last_name":"Peste","first_name":"Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87","full_name":"Peste, Elena-Alexandra"}],"doi":"10.15479/at:ista:13074","date_created":"2023-05-23T17:07:53Z","OA_place":"publisher","_id":"13074","supervisor":[{"first_name":"Christoph","orcid":"0000-0001-8622-7887","last_name":"Lampert","full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"},{"last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian"}],"citation":{"short":"A. Krumes, Efficiency and Generalization of Sparse Neural Networks, Institute of Science and Technology Austria, 2023.","mla":"Krumes, Alexandra. <i>Efficiency and Generalization of Sparse Neural Networks</i>. Institute of Science and Technology Austria, 2023, doi:<a href=\"https://doi.org/10.15479/at:ista:13074\">10.15479/at:ista:13074</a>.","chicago":"Krumes, Alexandra. 
“Efficiency and Generalization of Sparse Neural Networks.” Institute of Science and Technology Austria, 2023. <a href=\"https://doi.org/10.15479/at:ista:13074\">https://doi.org/10.15479/at:ista:13074</a>.","ama":"Krumes A. Efficiency and generalization of sparse neural networks. 2023. doi:<a href=\"https://doi.org/10.15479/at:ista:13074\">10.15479/at:ista:13074</a>","apa":"Krumes, A. (2023). <i>Efficiency and generalization of sparse neural networks</i>. Institute of Science and Technology Austria. <a href=\"https://doi.org/10.15479/at:ista:13074\">https://doi.org/10.15479/at:ista:13074</a>","ista":"Krumes A. 2023. Efficiency and generalization of sparse neural networks. Institute of Science and Technology Austria.","ieee":"A. Krumes, “Efficiency and generalization of sparse neural networks,” Institute of Science and Technology Austria, 2023."},"oa":1,"date_published":"2023-05-23T00:00:00Z","user_id":"ba8df636-2132-11f1-aed0-ed93e2281fdd","alternative_title":["ISTA Thesis"],"publication_identifier":{"issn":["2663-337X"]},"page":"147","file_date_updated":"2023-05-24T16:12:59Z","acknowledged_ssus":[{"_id":"ScienComp"}],"abstract":[{"lang":"eng","text":"Deep learning has become an integral part of a large number of important applications, and many of the recent breakthroughs have been enabled by the ability to train very large models, capable to capture complex patterns and relationships from the data. At the same time, the massive sizes of modern deep learning models have made their deployment to smaller devices more challenging; this is particularly important, as in many applications the users rely on accurate deep learning predictions, but they only have access to devices with limited memory and compute power. One solution to this problem is to prune neural networks, by setting as many of their parameters as possible to zero, to obtain accurate sparse models with lower memory footprint. 
Despite the great research progress in obtaining sparse models that preserve accuracy, while satisfying memory and computational constraints, there are still many challenges associated with efficiently training sparse models, as well as understanding their generalization properties.\r\n\r\nThe focus of this thesis is to investigate how the training process of sparse models can be made more efficient, and to understand the differences between sparse and dense models in terms of how well they can generalize to changes in the data distribution. We first study a method for co-training sparse and dense models, at a lower cost compared to regular training. With our method we can obtain very accurate sparse networks, and dense models that can recover the baseline accuracy. Furthermore, we are able to more easily analyze the differences, at prediction level, between the sparse-dense model pairs. Next, we investigate the generalization properties of sparse neural networks in more detail, by studying how well different sparse models trained on a larger task can adapt to smaller, more specialized tasks, in a transfer learning scenario. Our analysis across multiple pruning methods and sparsity levels reveals that sparse models provide features that can transfer similarly to or better than the dense baseline. However, the choice of the pruning method plays an important role, and can influence the results when the features are fixed (linear finetuning), or when they are allowed to adapt to the new task (full finetuning). Using sparse models with fixed masks for finetuning on new tasks has an important practical advantage, as it enables training neural networks on smaller devices. However, one drawback of current pruning methods is that the entire training cycle has to be repeated to obtain the initial sparse model, for every sparsity target; in consequence, the entire training process is costly and also multiple models need to be stored. 
In the last part of the thesis we propose a method that can train accurate dense models that are compressible in a single step, to multiple sparsity levels, without additional finetuning. Our method results in sparse models that can be competitive with existing pruning methods, and which can also successfully generalize to new tasks."}],"publication_status":"published","title":"Efficiency and generalization of sparse neural networks","oa_version":"Published Version","publisher":"Institute of Science and Technology Austria","month":"05","corr_author":"1","degree_awarded":"PhD","day":"23","ec_funded":1,"type":"dissertation","project":[{"_id":"2564DBCA-B435-11E9-9278-68D0E5697425","grant_number":"665385","call_identifier":"H2020","name":"International IST Doctoral Program"},{"_id":"268A44D6-B435-11E9-9278-68D0E5697425","grant_number":"805223","call_identifier":"H2020","name":"Elastic Coordination for Scalable Machine Learning"}],"year":"2023","status":"public","article_processing_charge":"No","has_accepted_license":"1","department":[{"_id":"GradSch"},{"_id":"DaAl"},{"_id":"ChLa"}],"language":[{"iso":"eng"}],"related_material":{"record":[{"relation":"part_of_dissertation","status":"public","id":"13053"},{"id":"11458","status":"public","relation":"part_of_dissertation"},{"status":"public","id":"12299","relation":"part_of_dissertation"}]},"date_updated":"2026-04-07T13:30:20Z","file":[{"content_type":"application/pdf","date_updated":"2023-05-24T16:11:16Z","file_size":2152072,"success":1,"file_name":"PhD_Thesis_Alexandra_Peste_final.pdf","file_id":"13087","date_created":"2023-05-24T16:11:16Z","checksum":"6b3354968403cb9d48cc5a83611fb571","creator":"epeste","access_level":"open_access","relation":"main_file"},{"file_id":"13088","date_created":"2023-05-24T16:12:59Z","file_name":"PhD_Thesis_APeste.zip","checksum":"8d0df94bbcf4db72c991f22503b3fd60","creator":"epeste","access_level":"closed","relation":"source_file","file_size":1658293,"date_updated":"2023-05-24T16:12:59Z","cont
ent_type":"application/zip"}],"ddc":["000"]},{"ec_funded":1,"project":[{"_id":"268A44D6-B435-11E9-9278-68D0E5697425","name":"Elastic Coordination for Scalable Machine Learning","call_identifier":"H2020","grant_number":"805223"}],"type":"conference","year":"2023","conference":{"name":"ICLR: International Conference on Learning Representations","start_date":"2023-05-01","end_date":"2023-05-05","location":"Kigali, Rwanda "},"publisher":"OpenReview","corr_author":"1","month":"05","day":"01","related_material":{"link":[{"url":"https://github.com/IST-DASLab/CrAM","relation":"software"}],"record":[{"relation":"dissertation_contains","id":"13074","status":"public"}]},"language":[{"iso":"eng"}],"date_updated":"2026-04-07T13:30:19Z","ddc":["000"],"file":[{"relation":"main_file","file_id":"17294","checksum":"a6eec897e13a91cdc3eeaf309801752c","date_created":"2024-07-22T09:09:45Z","file_name":"2023_ICLR_Peste.pdf","creator":"dernst","access_level":"open_access","file_size":458201,"date_updated":"2024-07-22T09:09:45Z","content_type":"application/pdf","success":1}],"external_id":{"arxiv":["2207.14200"]},"status":"public","article_processing_charge":"No","publication":"11th International Conference on Learning Representations ","has_accepted_license":"1","department":[{"_id":"GradSch"},{"_id":"DaAl"},{"_id":"ChLa"}],"acknowledgement":"AP, EK, DA received funding from the European Research Council (ERC) under the European\r\nUnion’s Horizon 2020 research and innovation programme (grant agreement No 805223 ScaleML). AV acknowledges the support of the French Agence Nationale de la Recherche (ANR), under grant ANR-21-CE48-0016 (project COMCOPT). 
We further acknowledge the support from the Scientific Service Units (SSU) of ISTA through resources provided by Scientific Computing (SciComp).","author":[{"full_name":"Peste, Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87","first_name":"Elena-Alexandra","last_name":"Peste"},{"last_name":"Vladu","first_name":"Adrian","full_name":"Vladu, Adrian"},{"first_name":"Eldar","last_name":"Kurtic","full_name":"Kurtic, Eldar","id":"47beb3a5-07b5-11eb-9b87-b108ec578218"},{"first_name":"Christoph","last_name":"Lampert","orcid":"0000-0001-8622-7887","full_name":"Lampert, Christoph","id":"40C20FD2-F248-11E8-B48F-1D18A9856A87"},{"id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian","last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian"}],"date_created":"2023-05-23T11:36:18Z","_id":"13053","date_published":"2023-05-01T00:00:00Z","oa":1,"citation":{"ista":"Krumes A, Vladu A, Kurtic E, Lampert C, Alistarh D-A. 2023. CrAM: A Compression-Aware Minimizer. 11th International Conference on Learning Representations . ICLR: International Conference on Learning Representations.","ieee":"A. Krumes, A. Vladu, E. Kurtic, C. Lampert, and D.-A. Alistarh, “CrAM: A Compression-Aware Minimizer,” in <i>11th International Conference on Learning Representations </i>, Kigali, Rwanda , 2023.","short":"A. Krumes, A. Vladu, E. Kurtic, C. Lampert, D.-A. Alistarh, in:, 11th International Conference on Learning Representations , OpenReview, 2023.","chicago":"Krumes, Alexandra, Adrian Vladu, Eldar Kurtic, Christoph Lampert, and Dan-Adrian Alistarh. “CrAM: A Compression-Aware Minimizer.” In <i>11th International Conference on Learning Representations </i>. OpenReview, 2023.","mla":"Krumes, Alexandra, et al. “CrAM: A Compression-Aware Minimizer.” <i>11th International Conference on Learning Representations </i>, OpenReview, 2023.","ama":"Krumes A, Vladu A, Kurtic E, Lampert C, Alistarh D-A. CrAM: A Compression-Aware Minimizer. 
In: <i>11th International Conference on Learning Representations </i>. OpenReview; 2023.","apa":"Krumes, A., Vladu, A., Kurtic, E., Lampert, C., &#38; Alistarh, D.-A. (2023). CrAM: A Compression-Aware Minimizer. In <i>11th International Conference on Learning Representations </i>. Kigali, Rwanda : OpenReview."},"arxiv":1,"user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","acknowledged_ssus":[{"_id":"ScienComp"}],"abstract":[{"text":"Deep neural networks (DNNs) often have to be compressed, via pruning and/or quantization, before they can be deployed in practical settings. In this work we propose a new compression-aware minimizer dubbed CrAM that modifies the optimization step in a principled way, in order to produce models whose local loss behavior is stable under compression operations such as pruning. Thus, dense models trained via CrAM should be compressible post-training, in a single step, without significant accuracy loss. Experimental results on standard benchmarks, such as residual networks for ImageNet classification and BERT models for language modelling, show that CrAM produces dense models that can be more accurate than the standard SGD/Adam-based baselines, but which are stable under weight pruning: specifically, we can prune models in one-shot to 70-80% sparsity with almost no accuracy loss, and to 90% with reasonable (∼1%) accuracy loss, which is competitive with gradual compression methods. Additionally, CrAM can produce sparse models which perform well for transfer learning, and it also works for semi-structured 2:4 pruning patterns supported by GPU hardware. 
The code for reproducing the results is available at https://github.com/IST-DASLab/CrAM.","lang":"eng"}],"publication_status":"published","title":"CrAM: A Compression-Aware Minimizer","oa_version":"Published Version","main_file_link":[{"url":"https://openreview.net/pdf?id=_eTZBs-yedr","open_access":"1"}],"file_date_updated":"2024-07-22T09:09:45Z","quality_controlled":"1"},{"day":"22","publisher":"IEEE","conference":{"location":"Vancouver, BC, Canada","end_date":"2023-06-24","name":"CVPR: Conference on Computer Vision and Pattern Recognition","start_date":"2023-06-17"},"month":"08","corr_author":"1","type":"conference","project":[{"name":"Vienna Graduate School on Computational Optimization","grant_number":"W1260-N35","_id":"9B9290DE-BA93-11EA-9121-9846C619BF3A"},{"_id":"268A44D6-B435-11E9-9278-68D0E5697425","name":"Elastic Coordination for Scalable Machine Learning","call_identifier":"H2020","grant_number":"805223"}],"year":"2023","ec_funded":1,"article_processing_charge":"No","department":[{"_id":"DaAl"},{"_id":"ChLa"}],"isi":1,"publication":"2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition","external_id":{"arxiv":["2304.12622"],"isi":["001062531308068"]},"status":"public","date_updated":"2026-05-19T11:20:27Z","language":[{"iso":"eng"}],"related_material":{"link":[{"relation":"software","url":"https://github.com/IST-DASLab/pruned-vision-model-bias"}],"record":[{"relation":"dissertation_contains","status":"public","id":"21854"}]},"citation":{"apa":"Iofinova, E. B., Krumes, A., &#38; Alistarh, D.-A. (2023). Bias in pruned vision models: In-depth analysis and countermeasures. In <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i> (pp. 24364–24373). Vancouver, BC, Canada: IEEE. <a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">https://doi.org/10.1109/cvpr52729.2023.02334</a>","short":"E.B. Iofinova, A. Krumes, D.-A. Alistarh, in:, 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition, IEEE, 2023, pp. 
24364–24373.","chicago":"Iofinova, Eugenia B, Alexandra Krumes, and Dan-Adrian Alistarh. “Bias in Pruned Vision Models: In-Depth Analysis and Countermeasures.” In <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, 24364–73. IEEE, 2023. <a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">https://doi.org/10.1109/cvpr52729.2023.02334</a>.","mla":"Iofinova, Eugenia B., et al. “Bias in Pruned Vision Models: In-Depth Analysis and Countermeasures.” <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, IEEE, 2023, pp. 24364–73, doi:<a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">10.1109/cvpr52729.2023.02334</a>.","ama":"Iofinova EB, Krumes A, Alistarh D-A. Bias in pruned vision models: In-depth analysis and countermeasures. In: <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>. IEEE; 2023:24364-24373. doi:<a href=\"https://doi.org/10.1109/cvpr52729.2023.02334\">10.1109/cvpr52729.2023.02334</a>","ieee":"E. B. Iofinova, A. Krumes, and D.-A. Alistarh, “Bias in pruned vision models: In-depth analysis and countermeasures,” in <i>2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</i>, Vancouver, BC, Canada, 2023, pp. 24364–24373.","ista":"Iofinova EB, Krumes A, Alistarh D-A. 2023. Bias in pruned vision models: In-depth analysis and countermeasures. 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition. 
CVPR: Conference on Computer Vision and Pattern Recognition, 24364–24373."},"arxiv":1,"oa":1,"date_published":"2023-08-22T00:00:00Z","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","author":[{"orcid":"0000-0002-7778-3221","last_name":"Iofinova","first_name":"Eugenia B","id":"f9a17499-f6e0-11ea-865d-fdf9a3f77117","full_name":"Iofinova, Eugenia B"},{"full_name":"Peste, Elena-Alexandra","id":"32D78294-F248-11E8-B48F-1D18A9856A87","first_name":"Elena-Alexandra","last_name":"Peste"},{"last_name":"Alistarh","orcid":"0000-0003-3650-940X","first_name":"Dan-Adrian","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian"}],"doi":"10.1109/cvpr52729.2023.02334","date_created":"2024-01-10T08:42:40Z","_id":"14771","publication_identifier":{"eissn":["2575-7075"],"eisbn":["9798350301298"]},"acknowledgement":"The authors would like to sincerely thank Sara Hooker for her feedback during the development of this work. EI was supported in part by the FWF DK VGSCO, grant agreement number W1260-N35. AP and DA acknowledge generous ERC support, via Starting Grant 805223 ScaleML.","quality_controlled":"1","page":"24364-24373","main_file_link":[{"open_access":"1","url":"https://doi.org/10.48550/arXiv.2304.12622"}],"publication_status":"published","oa_version":"Preprint","title":"Bias in pruned vision models: In-depth analysis and countermeasures","abstract":[{"text":"Pruning—that is, setting a significant subset of the parameters of a neural network to zero—is one of the most popular methods of model compression. Yet, several recent works have raised the issue that pruning may induce or exacerbate bias in the output of the compressed model. Despite existing evidence for this phenomenon, the relationship between neural network pruning and induced bias is not well-understood. In this work, we systematically investigate and characterize this phenomenon in Convolutional Neural Networks for computer vision. 
First, we show that it is in fact possible to obtain highly-sparse models, e.g. with less than 10% remaining weights, which do not decrease in accuracy nor substantially increase in bias when compared to dense models. At the same time, we also find that, at higher sparsities, pruned models exhibit higher uncertainty in their outputs, as well as increased correlations, which we directly link to increased bias. We propose easy-to-use criteria which, based only on the uncompressed model, establish whether bias will increase with pruning, and identify the samples most susceptible to biased predictions post-compression. Our code can be found at https://github.com/IST-DASLab/pruned-vision-model-bias.","lang":"eng"}]}]