@article{18525,
  abstract     = {As their statistical power grows, genome-wide association studies (GWAS) have identified an increasing number of loci underlying quantitative traits of interest. These loci are scattered throughout the genome and are individually responsible only for small fractions of the total heritable trait variance. The recently proposed omnigenic model provides a conceptual framework to explain these observations by postulating that numerous distant loci contribute to each complex trait via effect propagation through intracellular regulatory networks. We formalize this conceptual framework by proposing the “quantitative omnigenic model” (QOM), a statistical model that combines prior knowledge of the regulatory network topology with genomic data. By applying our model to gene expression traits in yeast, we demonstrate that QOM achieves similar gene expression prediction performance to traditional GWAS with hundreds of times less parameters, while simultaneously extracting candidate causal and quantitative chains of effect propagation through the regulatory network for every individual gene. We estimate the fraction of heritable trait variance in cis- and in trans-, break the latter down by effect propagation order, assess the trans- variance not attributable to transcriptional regulation, and show that QOM correctly accounts for the low-dimensional structure of gene expression covariance. We furthermore demonstrate the relevance of QOM for systems biology, by employing it as a statistical test for the quality of regulatory network reconstructions, and linking it to the propagation of nontranscriptional (including environmental) effects.},
  author       = {Ruzickova, Natalia and Hledik, Michal and Tkačik, Gašper},
  issn         = {1091-6490},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  number       = {44},
  publisher    = {National Academy of Sciences},
  title        = {Quantitative omnigenic model discovers interpretable genome-wide associations},
  doi          = {10.1073/pnas.2402340121},
  volume       = {121},
  year         = {2024},
}

@phdthesis{15020,
  abstract     = {This thesis consists of four distinct pieces of work within theoretical biology, with two themes in common: the concept of optimization in biological systems, and the use of information-theoretic tools to quantify biological stochasticity and statistical uncertainty.
Chapter 2 develops a statistical framework for studying biological systems which we believe to be optimized for a particular utility function, such as retinal neurons conveying information about visual stimuli. We formalize such beliefs as maximum-entropy Bayesian priors, constrained by the expected utility. We explore how such priors aid inference of system parameters with limited data and enable optimality hypothesis testing: is the utility higher than by chance?
Chapter 3 examines the ultimate biological optimization process: evolution by natural selection. As some individuals survive and reproduce more successfully than others, populations evolve towards fitter genotypes and phenotypes. We formalize this as accumulation of genetic information, and use population genetics theory to study how much such information can be accumulated per generation and maintained in the face of random mutation and genetic drift. We identify the population size and fitness variance as the key quantities that control information accumulation and maintenance.
Chapter 4 reuses the concept of genetic information from Chapter 3, but from a different perspective: we ask how much genetic information organisms actually need, in particular in the context of gene regulation. For example, how much information is needed to bind transcription factors at correct locations within the genome? Population genetics provides us with a refined answer: with an increasing population size, populations achieve higher fitness by maintaining more genetic information. Moreover, regulatory parameters experience selection pressure to optimize the fitness-information trade-off, i.e. minimize the information needed for a given fitness. This provides an evolutionary derivation of the optimization priors introduced in Chapter 2.
Chapter 5 proves an upper bound on mutual information between a signal and a communication channel output (such as neural activity). Mutual information is an important utility measure for biological systems, but its practical use can be difficult due to the large dimensionality of many biological channels. Sometimes, a lower bound on mutual information is computed by replacing the high-dimensional channel outputs with decodes (signal estimates). Our result provides a corresponding upper bound, provided that the decodes are the maximum posterior estimates of the signal.},
  author       = {Hledik, Michal},
  issn         = {2663-337X},
  keywords     = {Theoretical biology, Optimality, Evolution, Information},
  pages        = {158},
  publisher    = {Institute of Science and Technology Austria},
  school       = {Institute of Science and Technology Austria},
  title        = {Genetic information and biological optimization},
  doi          = {10.15479/at:ista:15020},
  year         = {2024},
}

@article{12081,
  abstract     = {Selection accumulates information in the genome—it guides stochastically evolving populations toward states (genotype frequencies) that would be unlikely under neutrality. This can be quantified as the Kullback–Leibler (KL) divergence between the actual distribution of genotype frequencies and the corresponding neutral distribution. First, we show that this population-level information sets an upper bound on the information at the level of genotype and phenotype, limiting how precisely they can be specified by selection. Next, we study how the accumulation and maintenance of information is limited by the cost of selection, measured as the genetic load or the relative fitness variance, both of which we connect to the control-theoretic KL cost of control. The information accumulation rate is upper bounded by the population size times the cost of selection. This bound is very general, and applies across models (Wright–Fisher, Moran, diffusion) and to arbitrary forms of selection, mutation, and recombination. Finally, the cost of maintaining information depends on how it is encoded: Specifying a single allele out of two is expensive, but one bit encoded among many weakly specified loci (as in a polygenic trait) is cheap.},
  author       = {Hledik, Michal and Barton, Nicholas H and Tkačik, Gašper},
  issn         = {1091-6490},
  journal      = {Proceedings of the National Academy of Sciences of the United States of America},
  number       = {36},
  publisher    = {National Academy of Sciences},
  title        = {Accumulation and maintenance of information in evolution},
  doi          = {10.1073/pnas.2123152119},
  volume       = {119},
  year         = {2022},
}

@article{9816,
  abstract     = {Aims: Mass antigen testing programs have been challenged because of an alleged insufficient specificity, leading to a large number of false positives. The objective of this study is to derive a lower bound of the specificity of the SD Biosensor Standard Q Ag-Test in large scale practical use.
Methods: Based on county data from the nationwide tests for SARS-CoV-2 in Slovakia between 31.10.–1.11. 2020 we calculate a lower confidence bound for the specificity. As positive test results were not systematically verified by PCR tests, we base the lower bound on a worst case assumption, assuming all positives to be false positives.
Results: 3,625,332 persons from 79 counties were tested. The lowest positivity rate was observed in the county of Rožňava where 100 out of 34307 (0.29%) tests were positive. This implies a test specificity of at least 99.6% (97.5% one-sided lower confidence bound, adjusted for multiplicity).
Conclusion: The obtained lower bound suggests a higher specificity compared to earlier studies in spite of the underlying worst case assumption and the application in a mass testing setting. The actual specificity is expected to exceed 99.6% if the prevalence in the respective regions was non-negligible at the time of testing. To our knowledge, this estimate constitutes the first bound obtained from large scale practical use of an antigen test.},
  author       = {Hledik, Michal and Polechova, Jitka and Beiglböck, Mathias and Herdina, Anna Nele and Strassl, Robert and Posch, Martin},
  issn         = {1932-6203},
  journal      = {PLoS ONE},
  number       = {7},
  publisher    = {Public Library of Science},
  title        = {Analysis of the specificity of a {COVID-19} antigen test in the {Slovak} mass testing program},
  doi          = {10.1371/journal.pone.0255267},
  volume       = {16},
  year         = {2021},
}

@article{7553,
  abstract     = {Normative theories and statistical inference provide complementary approaches for the study of biological systems. A normative theory postulates that organisms have adapted to efficiently solve essential tasks, and proceeds to mathematically work out testable consequences of such optimality; parameters that maximize the hypothesized organismal function can be derived ab initio, without reference to experimental data. In contrast, statistical inference focuses on efficient utilization of data to learn model parameters, without reference to any a priori notion of biological function, utility, or fitness. Traditionally, these two approaches were developed independently and applied separately. Here we unify them in a coherent Bayesian framework that embeds a normative theory into a family of maximum-entropy “optimization priors.” This family defines a smooth interpolation between a data-rich inference regime (characteristic of “bottom-up” statistical models), and a data-limited ab inito prediction regime (characteristic of “top-down” normative theory). We demonstrate the applicability of our framework using data from the visual cortex, and argue that the flexibility it affords is essential to address a number of fundamental challenges relating to inference and prediction in complex, high-dimensional biological problems.},
  author       = {Mlynarski, Wiktor F and Hledik, Michal and Sokolowski, Thomas R and Tkačik, Gašper},
  journal      = {Neuron},
  number       = {7},
  pages        = {1227--1241.e5},
  publisher    = {Cell Press},
  title        = {Statistical analysis and optimality of neural systems},
  doi          = {10.1016/j.neuron.2021.01.020},
  volume       = {109},
  year         = {2021},
}

@inproceedings{7606,
  abstract     = {We derive a tight lower bound on equivocation (conditional entropy), or equivalently a tight upper bound on mutual information between a signal variable and channel outputs. The bound is in terms of the joint distribution of the signals and maximum a posteriori decodes (most probable signals given channel output). As part of our derivation, we describe the key properties of the distribution of signals, channel outputs and decodes, that minimizes equivocation and maximizes mutual information. This work addresses a problem in data analysis, where mutual information between signals and decodes is sometimes used to lower bound the mutual information between signals and channel outputs. Our result provides a corresponding upper bound.},
  author       = {Hledik, Michal and Sokolowski, Thomas R and Tkačik, Gašper},
  booktitle    = {IEEE Information Theory Workshop, ITW 2019},
  isbn         = {9781538669006},
  location     = {Visby, Sweden},
  publisher    = {IEEE},
  title        = {A tight upper bound on mutual information},
  doi          = {10.1109/ITW44776.2019.8989292},
  year         = {2019},
}

