{"file":[{"success":1,"checksum":"ddbe981f3ad3f6cb6daf12c954822eb8","file_size":37788223,"creator":"dernst","content_type":"application/pdf","file_name":"2025_ICLR_Pariza.pdf","access_level":"open_access","relation":"main_file","file_id":"20109","date_created":"2025-08-04T08:09:43Z","date_updated":"2025-08-04T08:09:43Z"}],"month":"04","year":"2025","arxiv":1,"_id":"20036","oa":1,"article_processing_charge":"No","author":[{"last_name":"Pariza","full_name":"Pariza, Valentinos","first_name":"Valentinos"},{"first_name":"Mohammadreza","last_name":"Salehi","full_name":"Salehi, Mohammadreza"},{"first_name":"Gertjan","last_name":"Burghouts","full_name":"Burghouts, Gertjan"},{"id":"26cfd52f-2483-11ee-8040-88983bcc06d4","first_name":"Francesco","full_name":"Locatello, Francesco","orcid":"0000-0002-4850-0683","last_name":"Locatello"},{"last_name":"Asano","full_name":"Asano, Yuki M.","first_name":"Yuki M."}],"citation":{"mla":"Pariza, Valentinos, et al. “Near, Far: Patch-Ordering Enhances Vision Foundation Models’ Scene Understanding.” 13th International Conference on Learning Representations, ICLR, 2025, pp. 72303–30.","ista":"Pariza V, Salehi M, Burghouts G, Locatello F, Asano YM. 2025. Near, far: Patch-ordering enhances vision foundation models’ scene understanding. 13th International Conference on Learning Representations. ICLR: International Conference on Learning Representations, 72303–72330.","ieee":"V. Pariza, M. Salehi, G. Burghouts, F. Locatello, and Y. M. Asano, “Near, far: Patch-ordering enhances vision foundation models’ scene understanding,” in 13th International Conference on Learning Representations, Singapore, Singapore, 2025, pp. 72303–72330.","short":"V. Pariza, M. Salehi, G. Burghouts, F. Locatello, Y.M. Asano, in:, 13th International Conference on Learning Representations, ICLR, 2025, pp. 72303–72330.","chicago":"Pariza, Valentinos, Mohammadreza Salehi, Gertjan Burghouts, Francesco Locatello, and Yuki M. Asano. “Near, Far: Patch-Ordering Enhances Vision Foundation Models’ Scene Understanding.” In 13th International Conference on Learning Representations, 72303–30. ICLR, 2025.","apa":"Pariza, V., Salehi, M., Burghouts, G., Locatello, F., & Asano, Y. M. (2025). Near, far: Patch-ordering enhances vision foundation models’ scene understanding. In 13th International Conference on Learning Representations (pp. 72303–72330). Singapore, Singapore: ICLR.","ama":"Pariza V, Salehi M, Burghouts G, Locatello F, Asano YM. Near, far: Patch-ordering enhances vision foundation models’ scene understanding. In: 13th International Conference on Learning Representations. ICLR; 2025:72303-72330."},"OA_type":"diamond","external_id":{"arxiv":["2408.11054"]},"tmp":{"short":"CC BY (4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","image":"/images/cc_by.png"},"department":[{"_id":"FrLo"}],"abstract":[{"lang":"eng","text":"We introduce NeCo: Patch Neighbor Consistency, a novel self-supervised training loss that enforces patch-level nearest neighbor consistency across a student and teacher model. Compared to contrastive approaches that only yield binary learning signals, i.e. \"attract\" and \"repel\", this approach benefits from the more fine-grained learning signal of sorting spatially dense features relative to reference patches. Our method leverages differentiable sorting applied on top of pretrained representations, such as DINOv2-registers to bootstrap the learning signal and further improve upon them. This dense post-pretraining leads to superior performance across various models and datasets, despite requiring only 19 hours on a single GPU. This method generates high-quality dense feature encoders and establishes several new state-of-the-art results such as +2.3 % and +4.2% for non-parametric in-context semantic segmentation on ADE20k and Pascal VOC, +1.6% and +4.8% for linear segmentation evaluations on COCO-Things and -Stuff and improvements in the 3D understanding of multi-view consistency on SPair-71k, by more than 1.5%."}],"status":"public","language":[{"iso":"eng"}],"OA_place":"publisher","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","type":"conference","date_published":"2025-04-01T00:00:00Z","ddc":["000"],"page":"72303-72330","publication_identifier":{"isbn":["9798331320850"]},"day":"01","date_updated":"2025-08-04T08:10:55Z","date_created":"2025-07-20T22:02:03Z","file_date_updated":"2025-08-04T08:09:43Z","publication":"13th International Conference on Learning Representations","scopus_import":"1","conference":{"start_date":"2025-04-24","name":"ICLR: International Conference on Learning Representations","location":"Singapore, Singapore","end_date":"2025-04-28"},"quality_controlled":"1","oa_version":"Published Version","publication_status":"published","title":"Near, far: Patch-ordering enhances vision foundation models' scene understanding","has_accepted_license":"1","publisher":"ICLR"}