{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T17:38:09Z","timestamp":1775669889005,"version":"3.50.1"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T00:00:00Z","timestamp":1764201600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T00:00:00Z","timestamp":1767052800000},"content-version":"vor","delay-in-days":33,"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/100009560","name":"American Gastroenterological Association","doi-asserted-by":"crossref","award":["AGA2023-32-06"],"award-info":[{"award-number":["AGA2023-32-06"]}],"id":[{"id":"10.13039\/100009560","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["npj Digit. Med."],"DOI":"10.1038\/s41746-025-02174-0","type":"journal-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T14:54:45Z","timestamp":1764255285000},"update-policy":"https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Benchmarking proprietary and open-source language and vision-language models for gastroenterology clinical reasoning"],"prefix":"10.1038","volume":"8","author":[{"given":"Seyed Amir Ahmad","family":"Safavi-Naini","sequence":"first","affiliation":[]},{"given":"Shuhaib","family":"Ali","sequence":"additional","affiliation":[]},{"given":"Omer","family":"Shahab","sequence":"additional","affiliation":[]},{"given":"Zahra","family":"Shahhoseini","sequence":"additional","affiliation":[]},{"given":"Thomas","family":"Savage","sequence":"additional","affiliation":[]},{"given":"Sara","family":"Rafiee","sequence":"additional","affiliation":[]},{"given":"Jamil S.","family":"Samaan","sequence":"additional","affiliation":[]},{"given":"Reem","family":"Al Shabeeb","sequence":"additional","affiliation":[]},{"given":"Farah","family":"Ladak","sequence":"additional","affiliation":[]},{"given":"Jamie O.","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Juan","family":"Echavarria","sequence":"additional","affiliation":[]},{"given":"Sumbal","family":"Babar","sequence":"additional","affiliation":[]},{"given":"Aasma","family":"Shaukat","sequence":"additional","affiliation":[]},{"given":"Samuel","family":"Margolis","sequence":"additional","affiliation":[]},{"given":"Nicholas P.","family":"Tatonetti","sequence":"additional","affiliation":[]},{"given":"Girish","family":"Nadkarni","sequence":"additional","affiliation":[]},{"given":"Bara","family":"El Kurdi","sequence":"additional","affiliation":[]},{"given":"Ali","family":"Soroush","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,27]]},"reference":[{"key":"2174_CR1","doi-asserted-by":"publisher","first-page":"2276","DOI":"10.14309\/ajg.0000000000002397","volume":"118","author":"JB Henson","year":"2023","unstructured":"Henson, J. B., Glissen Brown, J. R., Lee, J. P., Patel, A. & Leiman, D. A. Evaluation of the potential utility of an artificial intelligence chatbot in gastroesophageal reflux disease management. Am. J. Gastroenterol. 118, 2276\u20132279 (2023).","journal-title":"Am. J. Gastroenterol."},{"key":"2174_CR2","doi-asserted-by":"publisher","first-page":"S-882","DOI":"10.1016\/S0016-5085(24)02511-3","volume":"166","author":"SAA Safavi-Naini","year":"2024","unstructured":"Safavi-Naini, S. A. A. et al. Su1962 GI-COPILOT: AUGMENTING CHATGPT WITH GUIDELINE-BASED KNOWLEDGE. Gastroenterology 166, S-882\u2013S-883 (2024).","journal-title":"Gastroenterology"},{"key":"2174_CR3","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1038\/s43856-023-00370-1","volume":"3","author":"J Clusmann","year":"2023","unstructured":"Clusmann, J. et al. The future landscape of large language models in medicine. Commun. Med. 3, 141 (2023).","journal-title":"Commun. Med."},{"key":"2174_CR4","unstructured":"Liang, P. et al. Holistic evaluation of language models. Trans. Mach. Learn. Res. (2023)."},{"key":"2174_CR5","doi-asserted-by":"publisher","first-page":"175628482312186","DOI":"10.1177\/17562848231218618","volume":"16","author":"E Klang","year":"2023","unstructured":"Klang, E., Sourosh, A., Nadkarni, G. N., Sharif, K. & Lahat, A. Evaluating the role of ChatGPT in gastroenterology: a comprehensive systematic review of applications, benefits, and limitations. Ther. Adv. Gastroenterol. 16, 17562848231218618 (2023).","journal-title":"Ther. Adv. Gastroenterol."},{"key":"2174_CR6","doi-asserted-by":"publisher","first-page":"943","DOI":"10.1038\/s41591-024-03423-7","volume":"31","author":"K Singhal","year":"2025","unstructured":"Singhal, K. et al. Toward expert-level medical question answering with large language models. Nat. Med. 31, 943\u2013950 (2025).","journal-title":"Nat. Med."},{"key":"2174_CR7","doi-asserted-by":"publisher","unstructured":"Li, W. et al. Can multiple-choice questions really be useful in detecting the abilities of LLMs? in International Conference on Language Resources and Evaluation https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.48550\/arXiv.2403.17752 (2024).","DOI":"10.48550\/arXiv.2403.17752"},{"key":"2174_CR8","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-023-43436-9","volume":"13","author":"D Brin","year":"2023","unstructured":"Brin, D. et al. Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments. Sci. Rep. 13, 16492 (2023).","journal-title":"Sci. Rep."},{"key":"2174_CR9","doi-asserted-by":"publisher","DOI":"10.2196\/45312","volume":"9","author":"A Gilson","year":"2023","unstructured":"Gilson, A. et al. How does ChatGPT perform on the United States medical licensing examination (USMLE)? The implications of large language models for medical education and knowledge assessment. JMIR Med. Educ. 9, e45312 (2023).","journal-title":"JMIR Med. Educ."},{"key":"2174_CR10","doi-asserted-by":"publisher","first-page":"e0000198","DOI":"10.1371\/journal.pdig.0000198","volume":"2","author":"TH Kung","year":"2023","unstructured":"Kung, T. H. et al. Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models. PLOS Digit. Health 2, e0000198 (2023).","journal-title":"PLOS Digit. Health"},{"key":"2174_CR11","doi-asserted-by":"publisher","DOI":"10.1016\/j.imu.2023.101314","volume":"41","author":"H Khorshidi","year":"2023","unstructured":"Khorshidi, H. et al. Application of ChatGPT in multilingual medical education: how does ChatGPT fare in 2023\u2019s Iranian residency entrance examination. Inform. Med. Unlocked 41, 101314 (2023).","journal-title":"Inform. Med. Unlocked"},{"key":"2174_CR12","doi-asserted-by":"publisher","unstructured":"Moshirfar, M., Altaf, A. W., Stoakes, I. M., Tuttle, J. J. & Hoopes, P. C. Artificial intelligence in ophthalmology: a comparative analysis of GPT-3.5, GPT-4, and human expertise in answering StatPearls questions. Cureus https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.7759\/cureus.40822 (2023)","DOI":"10.7759\/cureus.40822"},{"key":"2174_CR13","doi-asserted-by":"publisher","first-page":"589","DOI":"10.1001\/jamaophthalmol.2023.1144","volume":"141","author":"A Mihalache","year":"2023","unstructured":"Mihalache, A., Popovic, M. M. & Muni, R. H. Performance of an artificial intelligence chatbot in ophthalmic knowledge assessment. JAMA Ophthalmol. 141, 589 (2023).","journal-title":"JAMA Ophthalmol."},{"key":"2174_CR14","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1016\/j.ajo.2023.05.024","volume":"254","author":"LZ Cai","year":"2023","unstructured":"Cai, L. Z. et al. Performance of generative large language models on ophthalmology board\u2013style questions. Am. J. Ophthalmol. 254, 141\u2013149 (2023).","journal-title":"Am. J. Ophthalmol."},{"key":"2174_CR15","doi-asserted-by":"publisher","first-page":"2280","DOI":"10.14309\/ajg.0000000000002320","volume":"118","author":"K Suchman","year":"2023","unstructured":"Suchman, K., Garg, S. & Trindade, A. J. Chat generative pretrained transformer fails the multiple-choice American College of Gastroenterology self-assessment test. Am. J. Gastroenterol. 118, 2280\u20132282 (2023).","journal-title":"Am. J. Gastroenterol."},{"key":"2174_CR16","doi-asserted-by":"publisher","first-page":"465","DOI":"10.1007\/s10157-023-02451-w","volume":"28","author":"R Noda","year":"2024","unstructured":"Noda, R. et al. Performance of ChatGPT and bard in self-assessment questions for nephrology board renewal. Clin. Exp. Nephrol. 28, 465\u2013469 (2024).","journal-title":"Clin. Exp. Nephrol."},{"key":"2174_CR17","doi-asserted-by":"publisher","first-page":"279","DOI":"10.1093\/ehjdh\/ztad029","volume":"4","author":"I Skalidis","year":"2023","unstructured":"Skalidis, I. et al. ChatGPT takes on the European exam in core cardiology: an artificial intelligence success story? Eur. Heart J. Digit. Health 4, 279\u2013281 (2023).","journal-title":"Eur. Heart J. Digit. Health"},{"key":"2174_CR18","doi-asserted-by":"publisher","first-page":"722","DOI":"10.1093\/ced\/llad197","volume":"49","author":"L Passby","year":"2024","unstructured":"Passby, L., Jenko, N. & Wernham, A. Performance of ChatGPT on specialty certificate examination in dermatology multiple-choice questions. Clin. Exp. Dermatol. 49, 722\u2013727 (2024).","journal-title":"Clin. Exp. Dermatol."},{"key":"2174_CR19","doi-asserted-by":"publisher","first-page":"NP1085","DOI":"10.1093\/asj\/sjad130","volume":"43","author":"P Humar","year":"2023","unstructured":"Humar, P., Asaad, M., Bengur, F. B. & Nguyen, V. ChatGPT is equivalent to first-year plastic surgery residents: evaluation of ChatGPT on the plastic surgery In-service examination. Aesthet. Surg. J. 43, NP1085\u2013NP1089 (2023).","journal-title":"Aesthet. Surg. J."},{"key":"2174_CR20","doi-asserted-by":"publisher","first-page":"4271","DOI":"10.1007\/s00405-023-08051-4","volume":"280","author":"CC Hoch","year":"2023","unstructured":"Hoch, C. C. et al. ChatGPT\u2019s quiz skills in different otolaryngology subspecialties: an analysis of 2576 single-choice and multiple-choice board certification preparation questions. Eur. Arch. Otorhinolaryngol. 280, 4271\u20134278 (2023).","journal-title":"Eur. Arch. Otorhinolaryngol."},{"key":"2174_CR21","doi-asserted-by":"publisher","first-page":"1090","DOI":"10.1227\/neu.0000000000002551","volume":"93","author":"R Ali","year":"2023","unstructured":"Ali, R. et al. Performance of ChatGPT, GPT-4, and Google Bard on a neurosurgery oral boards preparation question bank. Neurosurgery 93, 1090\u20131098 (2023).","journal-title":"Neurosurgery"},{"key":"2174_CR22","doi-asserted-by":"publisher","first-page":"1623","DOI":"10.1097\/CORR.0000000000002704","volume":"481","author":"ZC Lum","year":"2023","unstructured":"Lum, Z. C. Can artificial intelligence pass the American Board of Orthopaedic Surgery examination? Orthopaedic residents versus ChatGPT. Clin. Orthop. 481, 1623\u20131630 (2023).","journal-title":"Clin. Orthop."},{"key":"2174_CR23","doi-asserted-by":"publisher","first-page":"1015","DOI":"10.1007\/s13042-024-02318-w","volume":"16","author":"Y Zheng","year":"2025","unstructured":"Zheng, Y. et al. Large language models for medicine: a survey. Int J. Mach. Learn Cyber. 16, 1015\u20131040 (2025).","journal-title":"Int J. Mach. Learn Cyber."},{"key":"2174_CR24","unstructured":"Optimum. Quantization. HuggingFace Website https:\/\/linproxy.fan.workers.dev:443\/https\/huggingface.co\/docs\/optimum\/en\/concept_guides\/quantization (2024)."},{"key":"2174_CR25","doi-asserted-by":"crossref","unstructured":"Zhao, W. et al. DIVKNOWQA: assessing the reasoning ability of LLMs via open-domain question answering over knowledge base and text. In Proc. North American Chapter of the Association for Computational Linguistics Findings of the Association for Computational Linguistics: NAACL 2024 51\u201368 (Association for Computational Linguistics, 2024).","DOI":"10.18653\/v1\/2024.findings-naacl.5"},{"key":"2174_CR26","doi-asserted-by":"publisher","unstructured":"Li, S. et al. Evaluating quantized large language models. In Proc. International Conference on Machine Learning https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.48550\/arXiv.2402.18158 (2024).","DOI":"10.48550\/arXiv.2402.18158"},{"key":"2174_CR27","doi-asserted-by":"publisher","unstructured":"Gong, Z. et al. What makes quantization for large language model hard? An empirical study from the lens of perturbation. In Proc. AAAI Conference on Artificial Intelligence abs\/2403.6408 18082\u201318089 arXiv https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.48550\/arXiv.2403.06408 (2024).","DOI":"10.48550\/arXiv.2403.06408"},{"key":"2174_CR28","doi-asserted-by":"publisher","unstructured":"Buckley, T., Diao, J. A., Rajpurkar, P., Rodman, A. & Manrai, A. K. Accuracy of a vision-language model on challenging medical cases. Preprint at https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.48550\/arXiv.2311.05591.","DOI":"10.48550\/arXiv.2311.05591"},{"key":"2174_CR29","doi-asserted-by":"publisher","first-page":"175628482412270","DOI":"10.1177\/17562848241227031","volume":"17","author":"O Shahab","year":"2024","unstructured":"Shahab, O., El Kurdi, B., Shaukat, A., Nadkarni, G. & Soroush, A. Large language models: a primer and gastroenterology applications. Ther. Adv. Gastroenterol. 17, 17562848241227031 (2024).","journal-title":"Ther. Adv. Gastroenterol."},{"key":"2174_CR30","unstructured":"Lin, S., Hilton, J. & Evans, O. Teaching models to express their uncertainty in words. Preprint at https:\/\/linproxy.fan.workers.dev:443\/https\/openreview.net\/forum?id=8s8K2UZGTZ (2022)."},{"key":"2174_CR31","doi-asserted-by":"crossref","unstructured":"Savage, T. et al. Large language model uncertainty proxies: discrimination and calibration for medical diagnosis and treatment. J. Am. Med. Inform. Assoc. 32, 139\u2013149 (2025).","DOI":"10.1093\/jamia\/ocae254"},{"key":"2174_CR32","unstructured":"American Board of Internal Medicine (ABIM) Official Website. Gastroenterology certification exam content. abim.org https:\/\/linproxy.fan.workers.dev:443\/https\/www.abim.org\/certification\/exam-information\/gastroenterology\/exam-content (2024)."},{"key":"2174_CR33","unstructured":"Kowalczyk, M. A step-by-step guide to prompt engineering: best practices, challenges, and examples. Lakera.ai https:\/\/linproxy.fan.workers.dev:443\/https\/www.lakera.ai\/blog\/prompt-engineering-guide (2023)."},{"key":"2174_CR34","doi-asserted-by":"publisher","DOI":"10.2196\/50638","volume":"25","author":"B Mesk\u00f3","year":"2023","unstructured":"Mesk\u00f3, B. Prompt engineering as an important emerging skill for medical professionals: tutorial. J. Med. Internet Res. 25, e50638 (2023).","journal-title":"J. Med. Internet Res."},{"key":"2174_CR35","doi-asserted-by":"publisher","unstructured":"White, J. et al. A prompt pattern catalog to enhance prompt engineering with ChatGPT. Preprint at https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.48550\/arXiv.2302.11382 (2023).","DOI":"10.48550\/arXiv.2302.11382"},{"key":"2174_CR36","unstructured":"Zhou, Y. et al. Large language models are human-level prompt engineers. In Proc. International Conference on Learning Representations. https:\/\/linproxy.fan.workers.dev:443\/https\/openreview.net\/forum?id=92gvk82DE- (2022)."},{"key":"2174_CR37","unstructured":"Dettmers, T., Pagnoni, A., Holtzman, A. & Zettlemoyer, L. QLoRA: efficient finetuning of quantized LLMs. Preprint at https:\/\/linproxy.fan.workers.dev:443\/https\/openreview.net\/forum?id=OUIFPHEgJU (2023)."},{"key":"2174_CR38","unstructured":"Zaidi, N. Modified bloom\u2019s taxonomy for evaluating multiple choice questions. Baylor College of Medicine https:\/\/linproxy.fan.workers.dev:443\/https\/www.bcm.edu\/sites\/default\/files\/2019\/04\/principles-and-guidelines-for-assessments-6.15.15.pdf (2015)."},{"key":"2174_CR39","unstructured":"Anthropic. Consumer terms of service. Anthropic Official Website https:\/\/linproxy.fan.workers.dev:443\/https\/www.anthropic.com\/legal\/consumer-terms (2024)."},{"key":"2174_CR40","unstructured":"Poe. Poe privacy center. Poe Official Website https:\/\/linproxy.fan.workers.dev:443\/https\/poe.com\/privacy_center (2024)."},{"key":"2174_CR41","unstructured":"OpenAI. Data controls FAQ. OpenAI Official Website https:\/\/linproxy.fan.workers.dev:443\/https\/help.openai.com\/en\/articles\/7730893-data-controls-faq (2024)."}],"container-title":["npj Digital Medicine"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.nature.com\/articles\/s41746-025-02174-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.nature.com\/articles\/s41746-025-02174-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.nature.com\/articles\/s41746-025-02174-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T03:26:49Z","timestamp":1767065209000},"score":1,"resource":{"primary":{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.nature.com\/articles\/s41746-025-02174-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,27]]},"references-count":41,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,12]]}},"alternative-id":["2174"],"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1038\/s41746-025-02174-0","relation":{},"ISSN":["2398-6352"],"issn-type":[{"value":"2398-6352","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,27]]},"assertion":[{"value":"18 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 November 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"G.N. is deputy editor at\n                      npj Digital Medicine\n                      . All authors declare no financial or non-financial competing interests related to this work.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}],"article-number":"797"}}