% Cell Genomics article (2026): JODIE joint model of direct, indirect, and parent-of-origin genetic effects.
@article{21987,
  author    = {Krätschmer, Ilse and Hegemann, Laura and Hofmeister, Robin J. and Corfield, Elizabeth C. and Mahmoudi, Mahdi and Delaneau, Olivier and Andreassen, Ole A. and Campbell, Archie and Hayward, Caroline and Marioni, Riccardo E. and Ystrom, Eivind and Havdahl, Alexandra and Robinson, Matthew Richard},
  title     = {{Separating direct, indirect, and parent-of-origin genetic effects in the human population}},
  journal   = {Cell Genomics},
  publisher = {Elsevier},
  issn      = {2666-979X},
  year      = {2026},
  doi       = {10.1016/j.xgen.2026.101277},
  abstract  = {We introduce JODIE, a genetic joint modeling approach that estimates how DNA loci influence human traits by partitioning genetic effects into four components: direct effects (from a child’s alleles), indirect maternal and paternal effects (from parents’ alleles), and parent-of-origin (PofO) effects (dependent on parental transmission of alleles), while uniquely accounting for assortative mating. We analyze 30,000 child-mother-father trios from the Estonian Biobank and the Norwegian Mother, Father, and Child Cohort, focusing on height, body mass index, and childhood educational test scores. We find direct effects to be the largest contributor to trait variation, but combined, indirect parental and PofO effects are similarly substantial. We support our results by within-family genome-wide association testing and identify 276 independently associated DNA regions with a complex interplay between direct, indirect, and PofO effects. By joint modeling, we show that direct, indirect, and PofO effects collectively shape human phenotypic variation across loci genome-wide.},
}

% Cell Genomics article (2026): gVAMP, approximate message passing applied to whole-genome sequencing data for human height.
@article{21488,
  author    = {Depope, Al and Bajzik, Jakub and Mondelli, Marco and Robinson, Matthew Richard},
  title     = {{Joint modeling of whole-genome sequencing data for human height via approximate message passing}},
  journal   = {Cell Genomics},
  publisher = {Elsevier},
  issn      = {2666-979X},
  year      = {2026},
  doi       = {10.1016/j.xgen.2026.101162},
  abstract  = {Human height is a model for the genetic analysis of complex traits, and recent studies suggest the presence of thousands of common genetic variant associations and hundreds of low-frequency/rare variants. Here, we develop a new algorithmic paradigm based on approximate message passing (genomic vector approximate message passing [gVAMP]) for identifying DNA sequence variants associated with complex traits and common diseases in large-scale whole-genome sequencing (WGS) data. We show that gVAMP accurately localizes associations to variants with the correct frequency and position in the DNA, outperforming existing fine-mapping methods in selecting the appropriate genetic variants within WGS data. We then apply gVAMP to jointly model the relationship of tens of millions of WGS variants with human height in hundreds of thousands of UK Biobank individuals. We identify 59 rare variants and gene burden scores alongside many hundreds of DNA regions containing common variant associations and show that understanding the genetic basis of complex traits will require the joint analysis of hundreds of millions of variables measured on millions of people. The polygenic risk scores obtained from gVAMP have high accuracy (including a prediction accuracy of ∼46% for human height) and outperform current methods for downstream tasks such as mixed linear model association testing across 13 UK Biobank traits. In conclusion, gVAMP offers a scalable foundation for a wider range of analyses in WGS data.},
}

% Clinical Epigenetics article (2025, vol. 17): blood-based EWAS and EpiScore prediction of alcohol consumption.
% Surnames "McCartney" and "McIntosh" use the same capitalisation as entry 10702 for the same co-authors.
@article{19023,
  abstract     = {Alcohol consumption is an important risk factor for multiple diseases. It is typically assessed via self-report, which is open to measurement error through recall bias. Instead, molecular data such as blood-based DNA methylation (DNAm) could be used to derive a more objective measure of alcohol consumption by incorporating information from cytosine-phosphate-guanine (CpG) sites known to be linked to the trait. Here, we explore the epigenetic architecture of self-reported weekly units of alcohol consumption in the Generation Scotland study. We first create a blood-based epigenetic score (EpiScore) of alcohol consumption using elastic net penalized linear regression. We explore the effect of pre-filtering for CpG features ahead of elastic net, as well as differential patterns by sex and by units consumed in the last week relative to an average week. The final EpiScore was trained on 16,717 individuals and tested in four external cohorts: the Lothian Birth Cohorts (LBC) of 1921 and 1936, the Sister Study, and the Avon Longitudinal Study of Parents and Children (total N across studies > 10,000). The maximum Pearson correlation between the EpiScore and self-reported alcohol consumption within cohort ranged from 0.41 to 0.53. In LBC1936, higher EpiScore levels had significant associations with poorer global brain imaging metrics, whereas self-reported alcohol consumption did not. Finally, we identified two novel CpG loci via a Bayesian penalized regression epigenome-wide association study of alcohol consumption. Together, these findings show how DNAm can objectively characterize patterns of alcohol consumption that associate with brain health, unlike self-reported estimates.},
  author       = {Bernabeu, Elena and Chybowska, Aleksandra D. and Kresovich, Jacob K. and Suderman, Matthew and McCartney, Daniel L. and Hillary, Robert F. and Corley, Janie and Valdés-Hernández, Maria Del C. and Maniega, Susana Muñoz and Bastin, Mark E. and Wardlaw, Joanna M. and Xu, Zongli and Sandler, Dale P. and Campbell, Archie and Harris, Sarah E. and McIntosh, Andrew M. and Taylor, Jack A. and Yousefi, Paul and Cox, Simon R. and Evans, Kathryn L. and Robinson, Matthew Richard and Vallejos, Catalina A. and Marioni, Riccardo E.},
  issn         = {1868-7083},
  journal      = {Clinical Epigenetics},
  publisher    = {Springer Nature},
  title        = {{Blood-based epigenome-wide association study and prediction of alcohol consumption}},
  doi          = {10.1186/s13148-025-01818-y},
  volume       = {17},
  year         = {2025},
}

% ICASSP 2024 conference paper (IEEE): approximate message passing for genomic prediction and association testing.
@inproceedings{17147,
  author    = {Depope, Al and Mondelli, Marco and Robinson, Matthew Richard},
  title     = {{Inference of genetic effects via approximate message passing}},
  booktitle = {2024 IEEE International Conference on Acoustics, Speech, and Signal Processing},
  location  = {Seoul, Korea},
  publisher = {IEEE},
  pages     = {13151--13155},
  isbn      = {9798350344851},
  issn      = {1520-6149},
  year      = {2024},
  doi       = {10.1109/ICASSP48485.2024.10447198},
  abstract  = {Efficient utilization of large-scale biobank data is crucial for inferring the genetic basis of disease and predicting health outcomes from the DNA. Yet we lack efficient, accurate methods that scale to data where electronic health records are linked to whole genome sequence information. To address this issue, our paper develops a new algorithmic paradigm based on Approximate Message Passing (AMP), which is specifically tailored for genomic prediction and association testing. Our method yields comparable out-of-sample prediction accuracy to the state of the art on UK Biobank traits, whilst dramatically improving computational complexity, with a 8x-speed up in the run time. In addition, AMP theory provides a joint association testing framework, which outperforms the currently used REGENIE method, in roughly a third of the compute time. This first, truly large-scale application of the AMP framework lays the foundations for a far wider range of statistical analyses for hundreds of millions of variables measured on millions of people.},
}

% PhD thesis (2024), Institute of Science and Technology Austria: causal learning and chromatin contact map comparison.
% `school` carries the degree-granting institution for classic BibTeX styles; `publisher` is kept for tools that read it.
@phdthesis{18642,
  abstract     = {This thesis consists of two pieces of work in the broader field of computational biology,
both of which are methods for the analysis of large scale biological data, implemented in
efficient software.
Chapter 2 introduces a statistical software for causal discovery and inference from observed
genetic marker and phenotypic trait data. We explore in simulation how well the method
can fine-map genetic effects, find the correct causal structure among tens of traits and
millions of genetic markers, and infer the causal effect size for the discovered causal
relations. We then apply the method to 8 million markers and 17 traits from the UK
Biobank and show that many relationships found with other methods are likely due to
the effects of hidden confounders.
Chapter 3 describes how this method can be applied to longitudinal data. I show how one
can incorporate the background knowledge present in the known order of measurements to
improve the accuracy of the causal discovery process, and explore the method’s ability to
identify age specific genetic effects, and how the error rates of this recovery are influenced
by missing data due to different censoring mechanisms.
Chapter 4 introduces a statistical software for the comparison of chromatin contact maps
based on the structural similarity index. We explore the robustness of the method to
noise and size differences of the compared maps, show how it can measure evolutionary
conservation of topological features by providing a similarity ranking of syntenic regions,
and finally how it can detect alterations in 3D genome structure due to genetic mutations
in samples of medical relevance.
},
  author       = {Machnik, Nick N.},
  issn         = {2663-337X},
  pages        = {138},
  publisher    = {Institute of Science and Technology Austria},
  school       = {Institute of Science and Technology Austria},
  title        = {{Algorithms for causal learning and comparative analysis for genomic data}},
  doi          = {10.15479/at:ista:18642},
  year         = {2024},
}

% bioRxiv preprint (2024): graphical causal inference as an alternative to standard Mendelian Randomization.
% `note` is what classic styles print for this entry type; `booktitle` is kept for tools that read it.
@unpublished{18648,
  abstract     = {Statistical causal learning in genomics relies on the instrumental variable method of
Mendelian Randomization (MR). Currently, an overwhelming number of MR studies
purport to show causal relationships among a wide range of risk factors and outcomes.
Here, we show that selecting instrument variables from genome-wide association study
estimates leads to high false discovery rates for many MR approaches, which can be
greatly reduced by employing a graphical inference approach which: (i) explicitly tests
instrumental variable assumptions; (ii) distinguishes direct from indirect factors in very
high-dimensional data; (iii) discriminates pleiotropic from trait-specific markers, controlling for LD genome-wide; (iv) accommodates rare variants and binary outcomes in a
principled way; and (v) identifies potential unobserved latent confounding. For 17 traits
and 8.4M variants recorded for 458,747 individuals in the UK Biobank, we show that
standard MR analysis gives an abundance of findings that disappear under stringent
assumption checks, with many relationships reflecting potential unmeasured confounding. This implies that mixtures of temporal precedence and potential for reverse-causality
prohibit understanding the underlying nature of phenotypic and genetic correlations in
biobank data. We propose that well-curated longitudinal records are likely needed and
that our approach provides a first-step toward robust principled screening for potential
causal links.
},
  author       = {Machnik, Nick N. and Mahmoudi, Seyed Mahdi and Borczyk, Malgorzata and Krätschmer, Ilse and Bauer, Markus J. and Robinson, Matthew Richard},
  booktitle    = {bioRxiv},
  note         = {bioRxiv preprint},
  title        = {{Causal inference for multiple risk factors and diseases from genomics data}},
  doi          = {10.1101/2023.12.06.570392},
  year         = {2024},
}

% American Journal of Human Genetics article (2022, vol. 109, no. 11): liability-scale heritability for low-prevalence disease.
% The abstract's prevalence and LSH thresholds are written with the "≤" sign (the file is UTF-8).
@article{12142,
  abstract     = {Theory for liability-scale models of the underlying genetic basis of complex disease provides an important way to interpret, compare, and understand results generated from biological studies. In particular, through estimation of the liability-scale heritability (LSH), liability models facilitate an understanding and comparison of the relative importance of genetic and environmental risk factors that shape different clinically important disease outcomes. Increasingly, large-scale biobank studies that link genetic information to electronic health records, containing hundreds of disease diagnosis indicators that mostly occur infrequently within the sample, are becoming available. Here, we propose an extension of the existing liability-scale model theory suitable for estimating LSH in biobank studies of low-prevalence disease. In a simulation study, we find that our derived expression yields lower mean square error (MSE) and is less sensitive to prevalence misspecification as compared to previous transformations for diseases with ≤2% population prevalence and LSH of ≤0.45, especially if the biobank sample prevalence is less than that of the wider population. Applying our expression to 13 diagnostic outcomes of ≤3% prevalence in the UK Biobank study revealed important differences in LSH obtained from the different theoretical expressions that impact the conclusions made when comparing LSH across disease outcomes. This demonstrates the importance of careful consideration for estimation and prediction of low-prevalence disease outcomes and facilitates improved inference of the underlying genetic basis of ≤2% population prevalence diseases, especially where biobank sample ascertainment results in a healthier sample population.},
  author       = {Ojavee, Sven E. and Kutalik, Zoltan and Robinson, Matthew Richard},
  issn         = {0002-9297},
  journal      = {The American Journal of Human Genetics},
  keywords     = {Genetics (clinical), Genetics},
  number       = {11},
  pages        = {2009--2017},
  publisher    = {Elsevier},
  title        = {{Liability-scale heritability estimation for biobank studies of low-prevalence disease}},
  doi          = {10.1016/j.ajhg.2022.09.011},
  volume       = {109},
  year         = {2022},
}

% Genome Biology article (2022, vol. 23, no. 1): blood-based epigenome-wide analyses of general cognitive function.
% "Trejo Banos, Daniel" is written with the same compound surname as in entry 8430; initials are space-separated.
@article{10702,
  abstract     = {Background: Blood-based markers of cognitive functioning might provide an accessible way to track neurodegeneration years prior to clinical manifestation of cognitive impairment and dementia. Results: Using blood-based epigenome-wide analyses of general cognitive function, we show that individual differences in DNA methylation (DNAm) explain 35.0% of the variance in general cognitive function (g). A DNAm predictor explains ~4% of the variance, independently of a polygenic score, in two external cohorts. It also associates with circulating levels of neurology- and inflammation-related proteins, global brain imaging metrics, and regional cortical volumes. Conclusions: As sample sizes increase, the ability to assess cognitive function from DNAm data may be informative in settings where cognitive testing is unreliable or unavailable.},
  author       = {McCartney, Daniel L. and Hillary, Robert F. and Conole, Eleanor L. S. and Trejo Banos, Daniel and Gadd, Danni A. and Walker, Rosie M. and Nangle, Cliff and Flaig, Robin and Campbell, Archie and Murray, Alison D. and Maniega, Susana Muñoz and Valdés-Hernández, María Del C. and Harris, Mathew A. and Bastin, Mark E. and Wardlaw, Joanna M. and Harris, Sarah E. and Porteous, David J. and Tucker-Drob, Elliot M. and McIntosh, Andrew M. and Evans, Kathryn L. and Deary, Ian J. and Cox, Simon R. and Robinson, Matthew Richard and Marioni, Riccardo E.},
  issn         = {1474-760X},
  journal      = {Genome Biology},
  number       = {1},
  publisher    = {Springer Nature},
  title        = {{Blood-based epigenome-wide analyses of cognitive abilities}},
  doi          = {10.1186/s13059-021-02596-5},
  volume       = {23},
  year         = {2022},
}

% Nature Communications article (2021, vol. 12, no. 1): BayesW, Bayesian genome-wide analysis of age-at-onset phenotypes.
% Middle initials carry a period, as in "Ojavee, Sven E." in entry 12142.
@article{8430,
  abstract     = {While recent advancements in computation and modelling have improved the analysis of complex traits, our understanding of the genetic basis of the time at symptom onset remains limited. Here, we develop a Bayesian approach (BayesW) that provides probabilistic inference of the genetic architecture of age-at-onset phenotypes in a sampling scheme that facilitates biobank-scale time-to-event analyses. We show in extensive simulation work the benefits BayesW provides in terms of number of discoveries, model performance and genomic prediction. In the UK Biobank, we find many thousands of common genomic regions underlying the age-at-onset of high blood pressure (HBP), cardiac disease (CAD), and type-2 diabetes (T2D), and for the genetic basis of onset reflecting the underlying genetic liability to disease. Age-at-menopause and age-at-menarche are also highly polygenic, but with higher variance contributed by low frequency variants. Genomic prediction into the Estonian Biobank data shows that BayesW gives higher prediction accuracy than other approaches.},
  author       = {Ojavee, Sven E. and Kousathanas, Athanasios and Trejo Banos, Daniel and Orliac, Etienne J. and Patxot, Marion and Lall, Kristi and Magi, Reedik and Fischer, Krista and Kutalik, Zoltan and Robinson, Matthew Richard},
  issn         = {2041-1723},
  journal      = {Nature Communications},
  number       = {1},
  publisher    = {Nature Research},
  title        = {{Genomic architecture and prediction of censored time-to-event phenotypes with a Bayesian genome-wide analysis}},
  doi          = {10.1038/s41467-021-22538-w},
  volume       = {12},
  year         = {2021},
}