How to cite item

Value of the DeepSeek-R1 large language model in extracting structured data from magnetic resonance imaging reports of rectal cancer and assisting in tumor staging

  
@comment{Journal-article citation exported from the publisher's "How to cite item" page.
NOTE(review): the abstract field ends mid-sentence ("... vs. 132.1 s; P"); it appears
to have been truncated by the export. Restore the remainder from the article page.}
@article{QIMS155645,
	author = {Xie, Fan and Ouyang, Li-Zhu and Guo, Bao-Liang and Huang, Xi-Yi and Liu, Zi-Wei and Zhou, Lan-Ni and Pan, Jia-Ling and Wang, Li-Wen and Chen, Ming and Li, Yun-Jing and Lin, Qiong-Qi and Chen, Xin-Jie and Hu, Qiu-Gen and Ouyang, Fu-Sheng},
	title = {Value of the {DeepSeek-R1} large language model in extracting structured data from magnetic resonance imaging reports of rectal cancer and assisting in tumor staging},
	journal = {Quantitative Imaging in Medicine and Surgery},
	volume = {16},
	number = {7},
	year = {2026},
	abstract = {Background: Structured magnetic resonance imaging (MRI) reports improve rectal cancer diagnosis and treatment management, whereas free-text reports better describe complex MRI features, leading to divergent preferences and inconsistent acceptance among radiologists. This study aimed to evaluate the potential of DeepSeek-R1 to assist in the analysis of MRI reports for rectal cancer. Methods: This retrospective study analyzed 465 MRI reports of rectal cancer. Sixty reports were used to facilitate structured information extraction, refine reporting reminders and complete preliminary screening of large language models (LLMs). LLMs from the same developers were compared [i.e., GPT-4 vs. GPT-4o, Wenxinyiyan-4 vs. Wenxinyiyan-4Turbo, and DeepSeek-R1-7B vs. DeepSeek-R1-32B, vs. DeepSeek-R1-671B (DSR1-671B)], and the accuracy and average processing time of the LLMs were evaluated with paired-samples t-tests, one-way repeated-measures analysis of variance, McNemar tests, and Cochran Q tests. The top-performing model was selected for tumor-node (TN) staging determination. Both a five-point Likert scale and accuracy were employed as performance metrics for evaluating two prompting strategies (default knowledge and in-context knowledge) in TN staging determination, and any confabulations were documented. To assess reproducibility, the LLM analysis was independently repeated three times for all reports. Radiologists' consensus interpretation of the report text served as the reference standard. Results: Five of the LLMs evaluated, including DSR1-671B, Wenxinyiyan-4, Wenxinyiyan-4Turbo, GPT-4, and GPT-4o, were identified in the preliminary screening by the same developers. All five LLMs processed reports significantly faster than did radiologists (time per report: 22.4--84.6 vs. 132.1 s; P},
	issn = {2223-4306},
	url = {https://qims.amegroups.org/article/view/155645},
}