% Conference paper: Anthemius, in the proceedings of the 29th Financial Cryptography and Data Security conference.
% The numeric citation key is kept unchanged so that existing citations still resolve.
@inproceedings{21042,
  abstract     = {Many blockchains such as Ethereum execute all incoming transactions sequentially significantly limiting the potential throughput. A common approach to scale execution is parallel execution engines that fully utilize modern multi-core architectures. Parallel execution is then either done optimistically, by executing transactions in parallel and detecting conflicts on the fly, or guided, by requiring exhaustive client transaction hints and scheduling transactions accordingly.

However, recent studies have shown that the performance of parallel execution engines depends on the nature of the underlying workload. In fact, in some cases, only a 60\% speed-up compared to sequential execution could be obtained. This is the case, as transactions that access the same resources must be executed sequentially. For example, if 10\% of the transactions in a block access the same resource, the execution cannot meaningfully scale beyond 10 cores. Therefore, a single popular application can bottleneck the execution and limit the potential throughput.

In this paper, we introduce Anthemius, a block construction algorithm that optimizes parallel transaction execution throughput. We evaluate Anthemius exhaustively under a range of workloads, and show that Anthemius enables the underlying parallel execution engine to process over twice as many transactions.},
  author       = {Neiheiser, Ray and Kokoris Kogias, Eleftherios},
  booktitle    = {29th International Conference on Financial Cryptography and Data Security},
  isbn         = {9783032070234},
  issn         = {1611-3349},
  pages        = {307--323},
  publisher    = {Springer Nature},
  series       = {Lecture Notes in Computer Science},
  title        = {{Anthemius}: Efficient and modular block assembly for concurrent execution},
  venue        = {Miyakojima, Japan},
  doi          = {10.1007/978-3-032-07024-1_18},
  volume       = {15751},
  year         = {2026},
}

% Journal article: Kauri, published in ACM Transactions on Computer Systems.
% The numeric citation key is kept unchanged so that existing citations still resolve.
@article{21017,
  abstract     = {With the growing interest in blockchains, permissioned approaches to consensus have received increasing attention. Unfortunately, the BFT consensus algorithms that are the backbone of most of these blockchains scale poorly and offer limited throughput. In fact, many state-of-the-art BFT consensus algorithms require a single leader process to receive and validate votes from a quorum of processes and then broadcast the result, which is inherently non-scalable. Recent approaches avoid this bottleneck by using dissemination/aggregation trees to propagate values and collect and validate votes. However, the use of trees increases the round latency, which limits the throughput for deeper trees. In this paper we propose Kauri, a BFT communication abstraction that sustains high throughput as the system size grows by leveraging a novel pipelining technique to perform scalable dissemination and aggregation on trees. Furthermore, when the number of faults is moderate (arguably the most common case in practice), our construction is able to recover from faults in an optimal number of reconfiguration steps. We implemented and experimentally evaluated Kauri with up to 800 processes. Our results show that Kauri outperforms the throughput of state-of-the-art permissioned blockchain protocols, by up to 58x without compromising latency. Interestingly, in some cases, the parallelization provided by Kauri can also decrease the latency.},
  author       = {Neiheiser, Ray and Matos, Miguel and Rodrigues, Luis},
  issn         = {1557-7333},
  journal      = {ACM Transactions on Computer Systems},
  publisher    = {Association for Computing Machinery},
  title        = {{Kauri}: {BFT} consensus with pipelined tree-based dissemination and aggregation},
  doi          = {10.1145/3769423},
  year         = {2025},
}