% Vicoso (2025): journal article, Trends in Ecology and Evolution 40(8), pp. 728--730.
@article{20009,
  author       = {Vicoso, Beatriz},
  title        = {{Sex chromosome evolution in action in fourspine sticklebacks}},
  journal      = {Trends in Ecology and Evolution},
  volume       = {40},
  number       = {8},
  pages        = {728--730},
  year         = {2025},
  publisher    = {Elsevier},
  issn         = {0169-5347},
  doi          = {10.1016/j.tree.2025.06.010},
  abstract     = {The suppression of recombination between young X and Y chromosomes is a crucial step in their evolution, but why it occurs is not known. The detailed characterization of the polymorphic sex chromosomes of the fourspine stickleback by Liu et al. promises to shed new light on this longstanding question.},
}

% Li et al. (2025): journal article, Journal of the American Chemical Society 147(28), pp. 25043--25051.
@article{20010,
  author       = {Li, Liang and Shi, Wanzhuo and Mahajan, Ankit and Zhang, Junxiang and Gómez-Gómez, Marta and Labella, Jorge and Louie, Shayan and Torres, Tomás and Barlow, Stephen and Marder, Seth R. and Reichman, David R. and Venkataraman, Latha},
  title        = {{Too fast for spin flipping: Absence of chirality-induced spin selectivity in coherent electron transport through single-molecule junctions}},
  journal      = {Journal of the American Chemical Society},
  volume       = {147},
  number       = {28},
  pages        = {25043--25051},
  year         = {2025},
  publisher    = {American Chemical Society},
  issn         = {1520-5126},
  doi          = {10.1021/jacs.5c08517},
  abstract     = {Chirality-induced spin selectivity (CISS), which refers to the ability of chiral molecules to preferentially select spins during electron transfer, has attracted great attention during the past two decades. However, the theoretical and experimental understanding of the CISS effect remains preliminary. In this study, we demonstrate that there is no distinguishable CISS effect in the case of coherent electron transport through single chiral molecular junctions for a set of four molecules studied here. Our conclusion is based on statistical evaluations of thousands of single-molecule junctions across four different molecules with different origins of chirality measured by the scanning tunneling microscope-based break-junction technique. The experimental results for all molecules show no dependence on external magnetic field or chirality in both conductance and current–voltage measurements. In addition, ab initio Hartree-Fock calculations combined with the nonequilibrium Green’s function method reveal that the spin–orbit coupling within chiral junctions bound to a few gold atoms is generally too weak to induce detectable spin polarizations from spin flipping or spin filtering during the ultrafast electron-transport time scale. The absence of an observable CISS effect in the coherent electron-transport regime suggests that the effect may only be found in other electron-transfer regimes and requires further experimental and theoretical efforts to achieve a comprehensive understanding.},
}

% Zeng et al. (2025): journal article, ACS Materials Letters, pp. 2695--2701 (no volume/number recorded).
@article{20011,
  author       = {Zeng, Zezhu and Liang, Xia and Fan, Zheyong and Chen, Yue and Simoncelli, Michele and Cheng, Bingqing},
  title        = {{Thermal transport of amorphous hafnia across the glass transition}},
  journal      = {ACS Materials Letters},
  pages        = {2695--2701},
  year         = {2025},
  publisher    = {American Chemical Society},
  issn         = {2639-4979},
  doi          = {10.1021/acsmaterialslett.5c00263},
  abstract     = {Heat transport in glasses over a wide temperature range is critical for applications in gate dielectrics and thermal insulators but remains poorly understood due to the challenges in modeling vibrational anharmonicity and configurational dynamics across the glass transition. Recent predictions show an unusual decrease in thermal conductivity (κ) with temperature in amorphous hafnia (a-HfO2), contrasting with the typical trend in glasses. Using molecular dynamics with a machine-learning-based neuroevolution potential, we compute κ of a-HfO2 from 50 K to 2000 K. At low temperatures, the Wigner transport equation captures both anharmonicity and quantum statistics. Above 1200 K, atomic diffusion invalidates the quasiparticle picture, and we resort to the Green–Kubo method to capture convective transport. We further extend the Wigner transport equation to supercooled a-HfO2, revealing the crucial role of low-frequency modes in facilitating heat transport. The computed κ, based on both Green–Kubo and Wigner transport theories, increases continuously with temperature up to 2000 K.},
}

% Richter et al. (2025): conference paper, 47th International Conference on Software Engineering, pp. 2087--2099.
@inproceedings{20024,
  author       = {Richter, Cedric and Chalupa, Marek and Jakobs, Marie-Christine and Wehrheim, Heike},
  title        = {{Cooperative software verification via dynamic program splitting}},
  booktitle    = {47th International Conference on Software Engineering},
  pages        = {2087--2099},
  year         = {2025},
  publisher    = {IEEE},
  location     = {Ottawa, ON, Canada},
  isbn         = {9798331505691},
  issn         = {1558-1225},
  doi          = {10.1109/ICSE55347.2025.00092},
  abstract     = {Cooperative software verification divides the task of software verification among several verification tools in order to increase efficiency and effectiveness. The basic approach is to let verifiers work on different parts of a program and at the end join verification results. While this idea is intuitively appealing, cooperative verification is usually hindered by the fact that program decomposition (1) is often static, disregarding strengths and weaknesses of employed verifiers, and (2) often represents the decomposed program parts in a specific proprietary format, thereby making the use of off-the-shelf verifiers in cooperative verification difficult. In this paper, we propose a novel cooperative verification scheme that we call dynamic program splitting (DPS). Splitting decomposes programs into (smaller) programs, and thus directly enables the use of off-the-shelf tools. In DPS, splitting is dynamically applied on demand: Verification starts by giving a verification task (a program plus a correctness specification) to a verifier V1. Whenever V1 finds the current task to be hard to verify, it splits the task (i.e., the program) and restarts verification on subtasks. DPS continues until (1) a violation is found, (2) all subtasks are completed or (3) some user-defined stopping criterion is met. In the latter case, the remaining uncompleted subtasks are merged into a single one and are given to a next verifier V2, repeating the same procedure on the still unverified program parts. This way, the decomposition is steered by what is hard to verify for particular verifiers, leveraging their complementary strengths. We have implemented dynamic program splitting and evaluated it on benchmarks of the annual software verification competition SV-COMP. The evaluation shows that cooperative verification with DPS is able to solve verification tasks that none of the constituent verifiers can solve, without any significant overhead.},
}

% Abramian et al. (2025): journal article, npj Climate and Atmospheric Science, vol. 8 (no pages recorded).
@article{20026,
  author       = {Abramian, Sophie and Muller, Caroline J. and Risi, Camille and Fiolleau, Thomas and Roca, Rémy},
  title        = {{How key features of early development shape deep convective systems}},
  journal      = {npj Climate and Atmospheric Science},
  volume       = {8},
  year         = {2025},
  publisher    = {Springer Nature},
  issn         = {2397-3722},
  doi          = {10.1038/s41612-025-01154-1},
  abstract     = {Deep Convective Systems (DCSs) reaching scales of 100–1000 km play a pivotal role as the primary precipitation source in the tropics. Those systems can have large cloud shields, and thus not only affect severe precipitation patterns but also play a crucial part in modulating the tropical radiation budget. Understanding the complex factors that control how these systems grow and how they will behave in a warming climate remain fundamental challenges. Research efforts have been directed, on one hand, towards understanding the environmental control on these systems, and on the other hand, towards exploring the internal potential of systems to develop and self-aggregate in idealized simulations. However, we still lack understanding on the relative role of the environment and internal feedbacks on DCS mature size and why. The novel high-resolution global SAM simulation from the DYAMOND project, combined with the TOOCAN Lagrangian tracking of DCSs and machine learning tools, offers an unprecedented opportunity to explore this question. We find that a system’s growth rate during the first 2 h of development predicts its final size with a Pearson correlation coefficient of 0.65. Beyond this period, growth rate emerges as the strongest predictor. However, in the early stages, additional factors–such as ice water path heterogeneity, migration distance, interactions with neighboring systems, and deep shear–play a more significant role. Our study quantitatively assesses the relative influence of internal versus external factors on the mature cloud shield size. Our results show that system-intrinsic properties exert a stronger influence than environmental conditions, suggesting that the initial environment does not strictly constrain final system size, particularly for larger systems where internal dynamics dominate.},
}

% Pirie et al. (2025): journal article, Monthly Notices of the Royal Astronomical Society 541(2), pp. 1348--1376.
@article{20027,
  author       = {Pirie, C. A. and Best, P. N. and Duncan, K. J. and McLeod, D. J. and Cochrane, R. K. and Clausen, M. and Dunlop, J. S. and Flury, S. R. and Geach, J. E. and Hale, C. L. and Ibar, E. and Kondapally, R. and Li, Zefeng and Matthee, Jorryt J. and McLure, R. J. and Ossa-Fuentes, Luis and Patrick, A. L. and Smail, Ian and Sobral, D. and Stephenson, H. M. O. and Stott, J. P. and Swinbank, A. M.},
  title        = {{The JWST Emission Line Survey (JELS): An untargeted search for H α emission line galaxies at z > 6 and their physical properties}},
  journal      = {Monthly Notices of the Royal Astronomical Society},
  volume       = {541},
  number       = {2},
  pages        = {1348--1376},
  year         = {2025},
  publisher    = {Oxford University Press},
  issn         = {1365-2966},
  doi          = {10.1093/mnras/staf1006},
  abstract     = {We present the first results of the JWST Emission Line Survey (JELS). Utilizing the first NIRCam narrow-band imaging at 4.7 μm, over 63 arcmin$^{2}$ in the PRIMER/COSMOS field, we have identified 609 emission line galaxy candidates. From these, we robustly selected 35 H α star-forming galaxies at z ∼ 6.1, with H α star-formation rates (SFRH α) of ∼ 0.9 − 15 M$_{\odot}$ yr$^{-1}$. Combining our unique H α sample with the exquisite panchromatic data in the field, we explored their physical properties and star-formation histories, and compared these to a broad-band selected sample at z ∼ 6 which has offered vital new insights into the nature of high-redshift galaxies. UV-continuum slopes (β) were considerably redder for our H α sample (β ∼ −1.92) compared to the broad-band sample (β ∼ −2.35). This was not due to dust attenuation as our H α sample was relatively dust-poor (median AV = 0.23); instead, we argue that the reddened slopes could be due to nebular continuum. We compared SFRH α and the UV-continuum-derived SFRUV to SED-fitted measurements averaged over canonical time-scales of 10 and 100 Myr (SFR10 and SFR100). We found an increase in recent SFR for our sample of H α emitters, particularly at lower stellar masses (< $10^{9}$ M$_{\odot}$). We also found that SFRH α strongly traces SFR averaged over 10 Myr time-scales, whereas the UV-continuum overpredicts SFR on 100 Myr time-scales at low stellar masses. These results point to our H α sample undergoing ‘bursty’ star formation. Our F356W z ∼ 6 sample showed a larger scatter in SFR10/SFR100 across all stellar masses, which has highlighted how narrow-band photometric selections of H α emitters are key to quantifying the burstiness of star-formation activity.},
}

% Duncan et al. (2025): journal article, Monthly Notices of the Royal Astronomical Society 541(2), pp. 1329--1347.
@article{20028,
  author       = {Duncan, K. J. and McLeod, D. J. and Best, P. N. and Pirie, C. A. and Clausen, M. and Cochrane, R. K. and Dunlop, J. S. and Flury, S. R. and Geach, J. E. and Grogin, N. A. and Hale, C. L. and Ibar, E. and Kondapally, R. and Li, Zefeng and Matthee, Jorryt J. and McLure, R. J. and Ossa-Fuentes, Luis and Patrick, A. L. and Smail, Ian and Sobral, D. and Stephenson, H. M. O. and Stott, J. P. and Swinbank, A. M.},
  title        = {{The JWST Emission-Line Survey: Extending rest-optical narrow-band emission-line selection into the Epoch of Reionization}},
  journal      = {Monthly Notices of the Royal Astronomical Society},
  volume       = {541},
  number       = {2},
  pages        = {1329--1347},
  year         = {2025},
  publisher    = {Oxford University Press},
  issn         = {1365-2966},
  doi          = {10.1093/mnras/staf1061},
  abstract     = {We present the JWST Emission-Line Survey (JELS), a JWST imaging programme exploiting the wavelength coverage and sensitivity of the Near-Infrared Camera (NIRCam) to extend narrow-band rest-optical emission-line selection into the Epoch of Reionization (EoR) for the first time, and to enable unique studies of the resolved ionized gas morphology in individual galaxies across cosmic history. The primary JELS observations comprise ∼ 4.7 μm narrow-band imaging over ∼ 63 arcmin$^{2}$ designed to enable selection of H α emitters at z ∼ 6.1 and a host of novel emission-line samples, including [O III] (z ∼ 8.3) and Paschen α/β (z ∼ 1.5/2.8). For the F466N/F470N narrow-band observations, the emission-line sensitivities achieved are up to ∼ 2× more sensitive than current slitless spectroscopy surveys (5σ limits of 0.8–1.2 $\times 10^{-18}$ erg s$^{-1}$ cm$^{-2}$), corresponding to unobscured H α star formation rates (SFRs) of 0.9–1.3 M$_{\odot}$ yr$^{-1}$ at z ∼ 6.1, extending emission-line selections in the EoR to fainter populations. Simultaneously, JELS also adds F200W broad-band and F212N narrow-band imaging (H α at z ∼ 2.23) that probes SFRs ≳ 5× fainter than previous ground-based narrow-band studies (∼ 0.2 M$_{\odot}$ yr$^{-1}$), offering an unprecedented resolved view of star formation at cosmic noon. We present the detailed JELS design, key data processing steps specific to the survey observations, and demonstrate the exceptional data quality and imaging sensitivity achieved. We then summarize the key scientific goals of JELS, demonstrate the precision and accuracy of the expected redshift and measured emission-line recovery through detailed simulations, and present examples of spectroscopically confirmed H α and [O III] emitters discovered by JELS that illustrate the novel parameter space probed.},
}

% Guan et al. (2025): journal article, Cell Reports 44(7) (no pages recorded).
@article{20029,
  author       = {Guan, Bin and Xie, Ke Xuan and Du, Xin Qiao and Bai, Yu Xuan and Hao, Peng Chao and Lin, Wen Hui and Friml, Jiří and Xue, Hong Wei},
  title        = {{Arabidopsis phospholipase Dζ2 facilitates vacuolar acidification and autophagy under phosphorus starvation by interacting with VATD}},
  journal      = {Cell Reports},
  volume       = {44},
  number       = {7},
  year         = {2025},
  publisher    = {Elsevier},
  issn         = {2211-1247},
  doi          = {10.1016/j.celrep.2025.116024},
  abstract     = {Vacuolar acidification is crucial for the homeostasis of intracellular pH and the recycling of proteins and nutrients in cells, thereby playing important roles in various physiological processes related to vacuolar function. The key factors regulating vacuolar acidification and underlying mechanisms remain unclear. Here, we report that Arabidopsis phospholipase Dζ2 (PLDζ2) promotes the acidification of the vacuolar lumen to stimulate autophagic degradation under phosphorus deficiency. The pldζ2 mutant massively accumulates autophagic structures while exhibiting premature leaf senescence under nutrient starvation. Impaired autophagic flux, lytic vacuole morphology, and lytic degradation in pldζ2 indicate that PLDζ2 regulates autophagy by affecting the vacuolar function. PLDζ2 locates in both tonoplast and cytoplasm. Genetic, structural, and biochemical studies demonstrate that PLDζ2 directly interacts with vacuolar-type ATPase (V-ATPase) subunit D (VATD) to promote vacuolar acidification and autophagy under phosphorus starvation. These findings reveal the importance of V-ATPase and vacuolar pH in autophagic activity and provide clues in elucidating the regulatory mechanism of vacuolar acidification.},
}

% Durovcikova et al. (2025): journal article, The Astrophysical Journal Letters 987(2) (no pages recorded).
@article{20030,
  author       = {Ďurovčíková, Dominika and Eilers, Anna Christina and Simcoe, Robert A. and Welsh, Louise and Meyer, Romain A. and Matthee, Jorryt J. and Ryan-Weber, Emma V. and Yue, Minghao and Katz, Harley and Satyavolu, Sindhu and Becker, George and Davies, Frederick B. and Farina, Emanuele Paolo},
  title        = {{An extremely metal-poor Lyα emitter candidate at z = 6 revealed through absorption spectroscopy}},
  journal      = {The Astrophysical Journal Letters},
  volume       = {987},
  number       = {2},
  year         = {2025},
  publisher    = {IOP Publishing},
  issn         = {2041-8213},
  doi          = {10.3847/2041-8213/ade71c},
  abstract     = {We report the discovery of a Lyα emitter (LAE) candidate in the immediate foreground of the quasar PSO J158-14 at zQSO = 6.0685 at a projected distance ∼29 pkpc that is associated with an extremely metal-poor absorption system. This system was found in archival observations of the quasar field with the Very Large Telescope (VLT)/Multi-Unit Spectroscopic Explorer (MUSE) and was previously missed in searches of absorption systems using quasar absorption line spectroscopy, as it imparts no detectable metal absorption lines on the background quasar spectrum. The detected Lyα emission line at a redshift of zLAE = 6.0323 is well aligned with the outer edge of the quasar’s proximity zone and can plausibly cause its observed damping wing if it is associated with a proximate subdamped Lyα absorption system with a column density of $\log N_{\mathrm{H\,I}}/\mathrm{cm}^{-2} \approx 19.7$. A >10 hr medium-resolution spectrum of the quasar observed with the Magellan/Folded-port InfraRed Echellette (FIRE) and VLT/X-Shooter spectrographs reveals a metallicity constraint of [Z/H] < −3. Such low metallicity makes this system an extremely metal-poor galaxy candidate and provides an exciting site to study possible signatures of Population III stars.},
}

% Osorio-Navarro et al. (2025): journal article, Journal of Experimental Botany 76(10), pp. 2700--2714.
@article{20031,
  author       = {Osorio-Navarro, Claudio and Neira-Valenzuela, Gabriel and Sierra, Paula and Adamowski, Maciek and Toledo, Jorge and Norambuena, Lorena},
  title        = {{The configuration of the vacuole is driven by clathrin-mediated trafficking in root cells of Arabidopsis thaliana}},
  journal      = {Journal of Experimental Botany},
  volume       = {76},
  number       = {10},
  pages        = {2700--2714},
  year         = {2025},
  publisher    = {Oxford University Press},
  issn         = {1460-2431},
  doi          = {10.1093/jxb/eraf084},
  abstract     = {The central vacuole is a multifunctional organelle with the most significant occupancy in a differentiated plant cell. Plants depend on the function of the vacuole for critical development, growth, and environmental responses. As the cell expands, the vacuole changes shape and size, increasing its membrane and luminal content. The set of these events is called the vacuolar configuration process, which has not been well described. Our research highlights the impact of plasma membrane internalization on vacuole morphology during the vacuolar configuration process. We observed a direct correlation between differential endocytosis rates and the enrichment of vacuolar membranous structures. Chemical and genetic interference with clathrin-mediated endocytosis (CME) revealed that it is required for the vacuolar configuration of growing root cells. The contribution of CME to the vacuole configuration process co-occurs with the induction of post-trans-Golgi network (TGN)/early endosome (EE) trafficking with the participation of the Rab GTPases ARA6 and ARA7. Our results show that the CME plays an active role during vacuole configuration, most probably carrying the material that allows the establishment of the vacuole in elongating cells. Since membrane trafficking through the EE/TGN is required to reach the vacuole, additional players must be defined.},
}

% Chen et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 63716--63737.
@inproceedings{20032,
  author       = {Chen, Jiale and Yao, Dingling and Pervez, Adeel A. and Alistarh, Dan-Adrian and Locatello, Francesco},
  title        = {{Scalable mechanistic neural networks}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {63716--63737},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {We propose Scalable Mechanistic Neural Network (S-MNN), an enhanced neural network framework designed for scientific machine learning applications involving long temporal sequences. By reformulating the original Mechanistic Neural Network (MNN) (Pervez et al., 2024), we reduce the computational time and space complexities from cubic and quadratic with respect to the sequence length, respectively, to linear. This significant improvement enables efficient modeling of long-term dynamics without sacrificing accuracy or interpretability. Extensive experiments demonstrate that S-MNN matches the original MNN in precision while substantially reducing computational resources. Consequently, S-MNN can drop-in replace the original MNN in applications, providing a practical and efficient tool for integrating mechanistic bottlenecks into neural network models of complex dynamical systems. Source code is available at https://github.com/IST-DASLab/ScalableMNN.},
}

% Ildiz et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 2967--3006.
@inproceedings{20033,
  author       = {Ildiz, M. Emrullah and Gozeten, Halil Alperen and Taga, Ege Onur and Mondelli, Marco and Oymak, Samet},
  title        = {{High-dimensional analysis of knowledge distillation: Weak-to-Strong generalization and scaling laws}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {2967--3006},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {A growing number of machine learning scenarios rely on knowledge distillation where one uses the output of a surrogate model as labels to supervise the training of a target model. In this work, we provide a sharp characterization of this process for ridgeless, high-dimensional regression, under two settings: (i) model shift, where the surrogate model is arbitrary, and (ii) distribution shift, where the surrogate model is the solution of empirical risk minimization with out-of-distribution data. In both cases, we characterize the precise risk of the target model through non-asymptotic bounds in terms of sample size and data distribution under mild conditions. As a consequence, we identify the form of the optimal surrogate model, which reveals the benefits and limitations of discarding weak features in a data-dependent fashion. In the context of weak-to-strong (W2S) generalization, this has the interpretation that (i) W2S training, with the surrogate as the weak model, can provably outperform training with strong labels under the same data budget, but (ii) it is unable to improve the data scaling law. We validate our results on numerical experiments both on ridgeless regression and on neural network architectures.},
}

% Robert et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 101877--101913.
@inproceedings{20034,
  author       = {Robert, Thomas and Safaryan, Mher and Modoranu, Ionut-Vlad and Alistarh, Dan-Adrian},
  title        = {{LDAdam: Adaptive optimization from low-dimensional gradient statistics}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {101877--101913},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {We introduce LDAdam, a memory-efficient optimizer for training large models, that performs adaptive optimization steps within lower dimensional subspaces, while consistently exploring the full parameter space during training. This strategy keeps the optimizer's memory footprint to a fraction of the model size. LDAdam relies on a new projection-aware update rule for the optimizer states that allows for transitioning between subspaces, i.e., estimation of the statistics of the projected gradients. To mitigate the errors due to low-rank projection, LDAdam integrates a new generalized error feedback mechanism, which explicitly accounts for both gradient and optimizer state compression. We prove the convergence of LDAdam under standard assumptions, and provide empirical evidence that LDAdam allows for efficient fine-tuning and pre-training of language models.},
}

% Jacot et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 1905--1931.
@inproceedings{20035,
  author       = {Jacot, Arthur and Súkeník, Peter and Wang, Zihan and Mondelli, Marco},
  title        = {{Wide neural networks trained with weight decay provably exhibit neural collapse}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {1905--1931},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {Deep neural networks (DNNs) at convergence consistently represent the training data in the last layer via a geometric structure referred to as neural collapse. This empirical evidence has spurred a line of theoretical research aimed at proving the emergence of neural collapse, mostly focusing on the unconstrained features model. Here, the features of the penultimate layer are free variables, which makes the model data-agnostic and puts into question its ability to capture DNN training. Our work addresses the issue, moving away from unconstrained features and studying DNNs that end with at least two linear layers. We first prove generic guarantees on neural collapse that assume (i) low training error and balancedness of linear layers (for within-class variability collapse), and (ii) bounded conditioning of the features before the linear part (for orthogonality of class-means, and their alignment with weight matrices). The balancedness refers to the fact that $W_{\ell+1}^{\top} W_{\ell+1} \approx W_{\ell} W_{\ell}^{\top}$ for any pair of consecutive weight matrices of the linear part, and the bounded conditioning requires a well-behaved ratio between largest and smallest non-zero singular values of the features. We then show that such assumptions hold for gradient descent training with weight decay: (i) for networks with a wide first layer, we prove low training error and balancedness, and (ii) for solutions that are either nearly optimal or stable under large learning rates, we additionally prove the bounded conditioning. Taken together, our results are the first to show neural collapse in the end-to-end training of DNNs.},
}

% Pariza et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 72303--72330.
@inproceedings{20036,
  author       = {Pariza, Valentinos and Salehi, Mohammadreza and Burghouts, Gertjan and Locatello, Francesco and Asano, Yuki M.},
  title        = {{Near, far: Patch-ordering enhances vision foundation models' scene understanding}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {72303--72330},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {We introduce NeCo: Patch Neighbor Consistency, a novel self-supervised training loss that enforces patch-level nearest neighbor consistency across a student and teacher model. Compared to contrastive approaches that only yield binary learning signals, i.e. ``attract'' and ``repel'', this approach benefits from the more fine-grained learning signal of sorting spatially dense features relative to reference patches. Our method leverages differentiable sorting applied on top of pretrained representations, such as DINOv2-registers to bootstrap the learning signal and further improve upon them. This dense post-pretraining leads to superior performance across various models and datasets, despite requiring only 19 hours on a single GPU. This method generates high-quality dense feature encoders and establishes several new state-of-the-art results such as +2.3\% and +4.2\% for non-parametric in-context semantic segmentation on ADE20k and Pascal VOC, +1.6\% and +4.8\% for linear segmentation evaluations on COCO-Things and -Stuff and improvements in the 3D understanding of multi-view consistency on SPair-71k, by more than 1.5\%.},
}

% Sawmya et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 26244--26274.
@inproceedings{20037,
  author       = {Sawmya, Shashata and Kong, Linghao and Markov, Ilia and Alistarh, Dan-Adrian and Shavit, Nir},
  title        = {{Wasserstein distances, neuronal entanglement, and sparsity}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {26244--26274},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {Disentangling polysemantic neurons is at the core of many current approaches to interpretability of large language models. Here we attempt to study how disentanglement can be used to understand performance, particularly under weight sparsity, a leading post-training optimization technique. We suggest a novel measure for estimating neuronal entanglement: the Wasserstein distance of a neuron's output distribution to a Gaussian. Moreover, we show the existence of a small number of highly entangled ``Wasserstein Neurons'' in each linear layer of an LLM, characterized by their highly non-Gaussian output distributions, their role in mapping similar inputs to dissimilar outputs, and their significant impact on model accuracy. To study these phenomena, we propose a new experimental framework for disentangling polysemantic neurons. Our framework separates each layer's inputs to create a mixture of experts where each neuron's output is computed by a mixture of neurons of lower Wasserstein distance, each better at maintaining accuracy when sparsified without retraining. We provide strong evidence that this is because the mixture of sparse experts is effectively disentangling the input-output relationship of individual neurons, in particular the difficult Wasserstein neurons.},
}

% Jin et al. (2025): conference paper, 13th International Conference on Learning Representations, pp. 85165--85181.
@inproceedings{20038,
  author       = {Jin, Tian and Humayun, Ahmed Imtiaz and Evci, Utku and Subramanian, Suvinay and Yazdanbakhsh, Amir and Alistarh, Dan-Adrian and Dziugaite, Gintare Karolina},
  title        = {{The journey matters: Average parameter count over pre-training unifies sparse and dense scaling laws}},
  booktitle    = {13th International Conference on Learning Representations},
  pages        = {85165--85181},
  year         = {2025},
  publisher    = {ICLR},
  location     = {Singapore, Singapore},
  isbn         = {9798331320850},
  abstract     = {Pruning eliminates unnecessary parameters in neural networks; it offers a promising solution to the growing computational demands of large language models (LLMs). While many focus on post-training pruning, sparse pre-training---which combines pruning and pre-training into a single phase---provides a simpler alternative. In this work, we present the first systematic exploration of optimal sparse pre-training configurations for LLMs through an examination of 80 unique pruning schedules across different sparsity levels and training durations. We find that initiating pruning at 25\% of total training compute and concluding at 75\% achieves near-optimal final evaluation loss. These findings provide valuable insights for efficient and effective sparse pre-training of LLMs. Furthermore, we propose a new scaling law that modifies the Chinchilla scaling law to use the average parameter count over pre-training. Through empirical and theoretical validation, we demonstrate that this modified scaling law accurately models evaluation loss for both sparsely and densely pre-trained LLMs, unifying scaling laws across pre-training paradigms. Our findings indicate that while sparse pre-training achieves the same final model quality as dense pre-training for equivalent compute budgets, it provides substantial benefits through reduced model size, enabling significant potential computational savings during inference.},
}

% Journal article: Pedrotti, The Annals of Applied Probability 35(1), 2025.
% The page range is written with a bare double hyphen (no surrounding spaces),
% matching the other entries in this file.
@article{20040,
  abstract     = {Contractive coupling rates have been recently introduced by Conforti as a tool to establish convex Sobolev inequalities (including modified log-Sobolev and Poincaré inequality) for some classes of Markov chains. In this work, for most of the examples discussed by Conforti, we use contractive coupling rates to prove stronger inequalities, in the form of curvature lower bounds (in entropic and discrete Bakry–Émery sense) and geodesic convexity of some entropic functionals. In addition, we recall and give straightforward generalizations of some notions of coarse Ricci curvature, and we discuss some of their properties and relations with the concepts of couplings and coupling rates: as an application, we show exponential contraction of the p-Wasserstein distance for the heat flow in the aforementioned examples.},
  author       = {Pedrotti, Francesco},
  issn         = {1050-5164},
  journal      = {The Annals of Applied Probability},
  number       = {1},
  pages        = {196--250},
  publisher    = {Institute of Mathematical Statistics},
  title        = {{Contractive coupling rates and curvature lower bounds for Markov chains}},
  doi          = {10.1214/24-aap2113},
  volume       = {35},
  year         = {2025},
}

% Journal article: Groechenig and Shen, Journal of the European Mathematical Society, 2025
% (no volume/pages recorded in this entry).
% The abstract's math was garbled by extraction (detached subscripts, zero-width spaces,
% spaced-out symbols); it is rebuilt here as LaTeX math.
% NOTE(review): both moduli spaces appear as a bare M in the export, so any distinguishing
% decoration (accent or sub/superscript) was lost upstream -- confirm against the publisher's
% abstract via the DOI before relying on the notation.
@article{20043,
  abstract     = {We establish an isomorphism of complex K-theory of the moduli space $M$ of “$\mathrm{SL}_n$”-Higgs bundles of degree $d$ and rank $n$ (in the sense of Hausel–Thaddeus) and twisted complex K-theory of the orbifold $M$ of $\mathrm{PGL}_n$-Higgs bundles of degree $e$, where $(n,d)=(n,e)=1$. Along the way, we prove the vanishing of torsion for $H^{*}(M)$ and certain twisted complex K-theory groups of $M$. We also extend Arinkin’s autoduality of compactified Jacobian to a derived equivalence between $\mathrm{SL}_n$- and $\mathrm{PGL}_n$-Hitchin systems over the elliptic locus. In the appendix, we develop a formalism of $G$-sheaves of spectra, generalising equivariant homotopy theory to a relative setting.},
  author       = {Groechenig, Michael and Shen, Shiyu},
  issn         = {1435-9863},
  journal      = {Journal of the European Mathematical Society},
  publisher    = {EMS Press},
  title        = {{Complex K-theory of moduli spaces of Higgs bundles}},
  doi          = {10.4171/jems/1601},
  year         = {2025},
}

% Journal article: Connallon et al., Evolution 79(7), 2025.
% The middle initial of Svensson carries its period so that styles printing full given
% names render it as an initial ("Erik I. Svensson").
@article{20044,
  abstract     = {Genetic trade-offs—which occur when variants that are beneficial in some contexts of natural selection are harmful in others—can influence a wide range of evolutionary phenomena, from the maintenance of genetic variation to the evolution of aging and sex differences. An extensive body of evolutionary theory has focused on the consequences of such trade-offs, and recent analyses of Fisher’s geometric model have further quantified the expected proportion of new mutations that exhibit trade-offs. However, the theory remains silent regarding the prevalence of trade-offs among the variants that contribute to adaptation. Here, we extend Fisher’s geometric model to predict the prevalence of trade-offs among the adaptive mutations that become established or fixed in a population. We consider trade-offs between sexes, habitats, fitness components, and temporally fluctuating environments. In all 4 scenarios, trade-off alleles are consistently under-represented among established relative to new beneficial mutations—an effect that arises from the greater susceptibility of trade-off alleles to genetic drift. Adaptation during a population size decline exacerbates this deficit of trade-offs among established mutations, whereas population expansions dampen it. Consequently, threatened populations should primarily adapt using unconditionally beneficial alleles, while invasive populations are more prone to adaptation using variants that exhibit trade-offs.},
  author       = {Connallon, Tim and Czuppon, Peter and Olito, Colin and Goedert, Debora and Kokko, Hanna and Nava-Bolaños, Angela and Nilén, Sofie and Svensson, Erik I. and Zwoinska, Martyna and Dutoit, Ludovic and Ruzicka, Filip},
  issn         = {1558-5646},
  journal      = {Evolution},
  number       = {7},
  pages        = {1243--1255},
  publisher    = {Oxford University Press},
  title        = {{Predicting the prevalence of genetic trade-offs among adaptive substitutions}},
  doi          = {10.1093/evolut/qpaf061},
  volume       = {79},
  year         = {2025},
}