% Conference paper (ACL 2023) on HyDE, a hybrid diagnosis extractor for
% electronic phenotyping from unstructured clinical notes.
% Notes for maintainers:
%   - month uses the predefined BibTeX macro (unquoted) so styles can format it.
%   - The abstract contains LaTeX markup; math-mode symbols are used for
%     ">" and "approximately" so they render correctly in text mode.
%   - NOTE(review): url was normalised from a mirror-host form
%     (approjects.co.za/?big=...) to the microsoft.com publication path;
%     confirm that it resolves before publishing.
@inproceedings{blankemeier2023efficient,
  author    = {Blankemeier, Louis and Fries, Jason and Tinn, Robert and Preston, Joseph and Shah, Nigam and Chaudhari, Akshay},
  title     = {Efficient Diagnosis Assignment Using Unstructured Clinical Notes},
  booktitle = {{ACL} 2023},
  year      = {2023},
  month     = jul,
  abstract  = {Electronic phenotyping entails using electronic health records (EHRs) to identify patients with specific health outcomes and determine when those outcomes occurred. Unstructured clinical notes, which contain a vast amount of information, are a valuable resource for electronic phenotyping. However, traditional methods, such as rule-based labeling functions or neural networks, require significant manual effort to tune and may not generalize well to multiple indications. To address these challenges, we propose \textit{HyDE} (hybrid diagnosis extractor). HyDE is a simple framework for electronic phenotyping that integrates labeling functions and a disease-agnostic neural network to assign diagnoses to patients. By training HyDE's model to correct predictions made by labeling functions, we are able to disambiguate hypertension true positives and false positives with a supervised area under the precision-recall curve (AUPRC) of 0.85. We extend this hypertension-trained model to zero-shot evaluation of four other diseases, generating AUPRC values ranging from 0.82--0.95 and outperforming a labeling function baseline by 44 points in F1 score and a Word2Vec baseline by 24 points in F1 score on average. Furthermore, we demonstrate a speedup of $>$4x by pruning the length of inputs into our language model to $\sim$2.3\% of the full clinical notes, with negligible impact to the AUPRC. HyDE has the potential to improve the efficiency and efficacy of interpreting large-scale unstructured clinical notes for accurate EHR phenotyping.},
  url       = {https://www.microsoft.com/en-us/research/publication/efficient-diagnosis-assignment-using-unstructured-clinical-notes/},
}