{"author":[{"id":"09a8f98d-ec99-11ea-ae11-c063a7b7fe5f","full_name":"Frantar, Elias","last_name":"Frantar","first_name":"Elias"},{"last_name":"Castro","first_name":"Roberto","full_name":"Castro, Roberto"},{"first_name":"Jiale","last_name":"Chen","id":"4d0a9064-1ff6-11ee-9fa6-ec046c604785","full_name":"Chen, Jiale","orcid":"0000-0001-5337-5875"},{"full_name":"Hoefler, Torsten","last_name":"Hoefler","first_name":"Torsten"},{"last_name":"Alistarh","first_name":"Dan-Adrian","orcid":"0000-0003-3650-940X","id":"4A899BFC-F248-11E8-B48F-1D18A9856A87","full_name":"Alistarh, Dan-Adrian"}],"has_accepted_license":"1","type":"research_data_reference","oa_version":"Published Version","related_material":{"record":[{"status":"public","id":"19877","relation":"used_for_analysis_in"}]},"abstract":[{"text":"This is Marlin, a Mixed Auto-Regressive Linear kernel (and the name of one of the planet's fastest fish), an extremely optimized FP16xINT4 matmul kernel aimed at LLM inference that can deliver close to ideal (4x) speedups up to batchsizes of 16-32 tokens (in contrast to the 1-2 tokens of prior work with comparable speedup).\r\n\r\nAdditionally, it includes Sparse-Marlin, an extension of the MARLIN kernels adding support to 2:4 weight sparsity, achieving 5.3x speedups on NVIDIA GPUs (Ampere/Ada).","lang":"eng"}],"oa":1,"department":[{"_id":"DaAl"}],"article_processing_charge":"No","main_file_link":[{"open_access":"1","url":"https://doi.org/10.5281/ZENODO.14213091"}],"doi":"10.5281/ZENODO.14213091","user_id":"2DF688A6-F248-11E8-B48F-1D18A9856A87","month":"11","_id":"19884","tmp":{"name":"Creative Commons Attribution 4.0 International Public License (CC-BY 4.0)","legal_code_url":"https://creativecommons.org/licenses/by/4.0/legalcode","image":"/images/cc_by.png","short":"CC BY (4.0)"},"year":"2024","date_created":"2025-06-24T06:09:18Z","date_published":"2024-11-24T00:00:00Z","ddc":["000"],"OA_place":"repository","corr_author":"1","day":"24","citation":{"apa":"Frantar, E., Castro, R., Chen, J., Hoefler, T., & Alistarh, D.-A. (2024). MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. Zenodo. https://doi.org/10.5281/ZENODO.14213091","mla":"Frantar, Elias, et al. MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. Zenodo, 2024, doi:10.5281/ZENODO.14213091.","short":"E. Frantar, R. Castro, J. Chen, T. Hoefler, D.-A. Alistarh, (2024).","chicago":"Frantar, Elias, Roberto Castro, Jiale Chen, Torsten Hoefler, and Dan-Adrian Alistarh. “MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models.” Zenodo, 2024. https://doi.org/10.5281/ZENODO.14213091.","ama":"Frantar E, Castro R, Chen J, Hoefler T, Alistarh D-A. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models. 2024. doi:10.5281/ZENODO.14213091","ieee":"E. Frantar, R. Castro, J. Chen, T. Hoefler, and D.-A. Alistarh, “MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models.” Zenodo, 2024.","ista":"Frantar E, Castro R, Chen J, Hoefler T, Alistarh D-A. 2024. MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models, Zenodo, 10.5281/ZENODO.14213091."},"status":"public","date_updated":"2025-06-24T06:15:51Z","title":"MARLIN: Mixed-precision auto-regressive parallel inference on Large Language Models","publisher":"Zenodo"}