@article{VaidChanChaudharyetal.2021,
  author = {Vaid, Akhil and Chan, Lili and Chaudhary, Kumardeep and Jaladanki, Suraj K. and Paranjpe, Ishan and Russak, Adam J. and Kia, Arash and Timsina, Prem and Levin, Matthew A. and He, John Cijiang and B{\"o}ttinger, Erwin and Charney, Alexander W. and Fayad, Zahi A. and Coca, Steven G. and Glicksberg, Benjamin S. and Nadkarni, Girish N.},
  title = {Predictive approaches for acute dialysis requirement and death in COVID-19},
  series = {Clinical journal of the American Society of Nephrology: CJASN},
  journal = {Clinical journal of the American Society of Nephrology: CJASN},
  volume = {16},
  number = {8},
  publisher = {American Society of Nephrology},
  address = {Washington},
  organization = {MSCIC},
  issn = {1555-9041},
  doi = {10.2215/CJN.17311120},
  pages = {1158--1168},
  year = {2021},
  abstract = {Background and objectives: AKI treated with dialysis initiation is a common complication of coronavirus disease 2019 (COVID-19) among hospitalized patients. However, dialysis supplies and personnel are often limited. Design, setting, participants, \& measurements: Using data from adult patients hospitalized with COVID-19 at five hospitals in the Mount Sinai Health System who were admitted between March 10 and December 26, 2020, we developed and validated several models (logistic regression, Least Absolute Shrinkage and Selection Operator [LASSO], random forest, and eXtreme Gradient Boosting [XGBoost; with and without imputation]) for predicting treatment with dialysis or death at various time horizons (1, 3, 5, and 7 days) after hospital admission. Patients admitted to the Mount Sinai Hospital were used for internal validation, whereas the other hospitals formed the external validation cohort. Features included demographics, comorbidities, and laboratory values and vital signs within 12 hours of hospital admission. Results: A total of 6093 patients (2442 in training and 3651 in external validation) were included in the final cohort. Of the different modeling approaches used, XGBoost without imputation had the highest area under the receiver operating characteristic (AUROC) curve on internal validation (range of 0.93-0.98) and area under the precision-recall curve (AUPRC; range of 0.78-0.82) for all time points. XGBoost without imputation also had the highest test parameters on external validation (AUROC range of 0.85-0.87, and AUPRC range of 0.27-0.54) across all time windows. XGBoost without imputation outperformed all models with higher precision and recall (mean difference in AUROC of 0.04; mean difference in AUPRC of 0.15). Creatinine, BUN, and red cell distribution width were major drivers of the model's prediction. Conclusions: An XGBoost model without imputation for prediction of a composite outcome of either death or dialysis in patients positive for COVID-19 had the best performance, as compared with standard and other machine learning models.},
  language = {en}
}
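A minimal sketch (synthetic data; assumed xgboost/scikit-learn Python stack, not the study's code) of the "XGBoost without imputation" setup described in the abstract above: the classifier is trained directly on a feature matrix containing NaNs and evaluated with AUROC and AUPRC.

    # Minimal sketch, not the authors' code: XGBoost trained directly on data with
    # missing values, mirroring the "XGBoost without imputation" approach above.
    # Features and labels are synthetic placeholders.
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score, average_precision_score
    from xgboost import XGBClassifier

    rng = np.random.default_rng(0)
    n = 2000
    X = rng.normal(size=(n, 10))                  # stand-ins for labs/vitals at admission
    y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.5, size=n) > 1.0).astype(int)
    X[rng.random(X.shape) < 0.2] = np.nan         # roughly 20% missing values, left unimputed

    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

    # XGBoost handles NaN natively by learning a default split direction per node,
    # so no imputation step is required before fitting.
    model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
    model.fit(X_tr, y_tr)

    proba = model.predict_proba(X_te)[:, 1]
    print("AUROC:", roc_auc_score(y_te, proba))
    print("AUPRC:", average_precision_score(y_te, proba))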
@article{DellepianeVaidJaladankietal.2021,
  author = {Dellepiane, Sergio and Vaid, Akhil and Jaladanki, Suraj K. and Coca, Steven and Fayad, Zahi A. and Charney, Alexander W. and B{\"o}ttinger, Erwin and He, John Cijiang and Glicksberg, Benjamin S. and Chan, Lili and Nadkarni, Girish},
  title = {Acute kidney injury in patients hospitalized with COVID-19 in New York City},
  series = {Kidney medicine},
  journal = {Kidney medicine},
  volume = {3},
  number = {5},
  publisher = {Elsevier},
  address = {Amsterdam},
  issn = {2590-0595},
  doi = {10.1016/j.xkme.2021.06.008},
  pages = {877--879},
  year = {2021},
  language = {en}
}

@article{DattaSachsFreitasdaCruzetal.2021,
  author = {Datta, Suparno and Sachs, Jan Philipp and Freitas da Cruz, Harry and Martensen, Tom and Bode, Philipp and Morassi Sasso, Ariane and Glicksberg, Benjamin S. and B{\"o}ttinger, Erwin},
  title = {FIBER},
  series = {JAMIA open},
  journal = {JAMIA open},
  volume = {4},
  number = {3},
  publisher = {Oxford Univ. Press},
  address = {Oxford},
  issn = {2574-2531},
  doi = {10.1093/jamiaopen/ooab048},
  pages = {10},
  year = {2021},
  abstract = {Objectives: The development of clinical predictive models hinges upon the availability of comprehensive clinical data. Tapping into such resources requires considerable effort from clinicians, data scientists, and engineers. Specifically, these efforts are focused on the data extraction and preprocessing steps required prior to modeling, including complex database queries. A handful of software libraries exist that can reduce this complexity by building upon data standards. However, a gap remains concerning electronic health records (EHRs) stored in star schema clinical data warehouses, an approach often adopted in practice. In this article, we introduce the FlexIBle EHR Retrieval (FIBER) tool: a Python library built on top of a star schema (i2b2) clinical data warehouse that enables flexible generation of modeling-ready cohorts as data frames. Materials and Methods: FIBER was developed on top of a large-scale star schema EHR database which contains data from 8 million patients and over 120 million encounters. To illustrate FIBER's capabilities, we present its application by building a heart surgery patient cohort with subsequent prediction of acute kidney injury (AKI) with various machine learning models. Results: Using FIBER, we were able to build the heart surgery cohort (n = 12 061), identify the patients who developed AKI (n = 1005), and automatically extract relevant features (n = 774). Finally, we trained machine learning models that achieved area under the curve values of up to 0.77 for this exemplary use case. Conclusion: FIBER is an open-source Python library for extracting information from star schema clinical data warehouses; it reduces time-to-modeling and helps streamline the clinical modeling process.},
  language = {en}
}
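A hypothetical sketch of the kind of star-schema extraction that a library like FIBER wraps when it turns an i2b2-style warehouse into a modeling-ready data frame. The table name, concept codes, and column names below are illustrative assumptions, not FIBER's actual API.

    # Illustrative only: pivot i2b2-style fact rows into per-patient feature columns
    # and derive a simple AKI label, using sqlite3 and pandas on toy data.
    import sqlite3
    import pandas as pd

    con = sqlite3.connect(":memory:")
    con.executescript("""
    CREATE TABLE observation_fact (patient_num INT, concept_cd TEXT, nval_num REAL, start_date TEXT);
    INSERT INTO observation_fact VALUES
      (1, 'LOINC:2160-0', 1.1, '2020-03-12'),   -- creatinine
      (1, 'ICD10:N17.9', NULL, '2020-03-14'),   -- acute kidney injury code
      (2, 'LOINC:2160-0', 0.9, '2020-03-13');
    """)

    # One feature column per lab concept, one row per patient.
    features = pd.read_sql_query(
        "SELECT patient_num, concept_cd, nval_num FROM observation_fact "
        "WHERE concept_cd LIKE 'LOINC:%'", con
    ).pivot_table(index="patient_num", columns="concept_cd", values="nval_num")

    # Binary outcome: any AKI diagnosis code on record.
    labels = pd.read_sql_query(
        "SELECT DISTINCT patient_num, 1 AS aki FROM observation_fact "
        "WHERE concept_cd LIKE 'ICD10:N17%'", con
    )
    cohort = features.reset_index().merge(labels, on="patient_num", how="left").fillna({"aki": 0})
    print(cohort)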
@article{DeFreitasJohnsonGoldenetal.2021,
  author = {De Freitas, Jessica K. and Johnson, Kipp W. and Golden, Eddye and Nadkarni, Girish N. and Dudley, Joel T. and B{\"o}ttinger, Erwin and Glicksberg, Benjamin S. and Miotto, Riccardo},
  title = {Phe2vec},
  series = {Patterns},
  journal = {Patterns},
  volume = {2},
  number = {9},
  publisher = {Elsevier},
  address = {Amsterdam},
  issn = {2666-3899},
  doi = {10.1016/j.patter.2021.100337},
  pages = {9},
  year = {2021},
  abstract = {Robust phenotyping of patients from electronic health records (EHRs) at scale is a challenge in clinical informatics. Here, we introduce Phe2vec, an automated framework for disease phenotyping from EHRs based on unsupervised learning, and assess its effectiveness against standard rule-based algorithms from the Phenotype KnowledgeBase (PheKB). Phe2vec is based on pre-computing embeddings of medical concepts and patients' clinical histories. Disease phenotypes are then derived from a seed concept and its neighbors in the embedding space. Patients are linked to a disease if their embedded representation is close to the disease phenotype. Comparing Phe2vec and PheKB cohorts head-to-head using chart review, Phe2vec performed on par or better in nine out of ten diseases. Unlike other approaches, it can scale to any condition and was validated against widely adopted expert-based standards. Phe2vec aims to optimize clinical informatics research by augmenting current frameworks to characterize patients by condition and derive reliable disease cohorts.},
  language = {en}
}
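A toy sketch of the Phe2vec idea summarized in the abstract above, assuming a gensim word2vec backend and made-up concept codes (not the authors' implementation): concepts are embedded from patients' code sequences, a phenotype is formed from a seed concept and its nearest neighbors, and patients are linked by the cosine similarity of their averaged history embedding to the phenotype centroid.

    # Toy data and embeddings; in practice the concept sequences come from EHRs and
    # the embedding model is trained on millions of patient histories.
    import numpy as np
    from gensim.models import Word2Vec

    # "Clinical histories": each patient is a sequence of concept codes.
    histories = {
        "p1": ["ICD10:E11.9", "LOINC:4548-4", "RX:metformin"],   # diabetes-like history
        "p2": ["ICD10:I10", "RX:lisinopril", "LOINC:2160-0"],    # hypertension-like history
        "p3": ["ICD10:E11.9", "RX:insulin", "LOINC:4548-4"],
    }
    model = Word2Vec(sentences=list(histories.values()), vector_size=32,
                     window=5, min_count=1, sg=1, seed=0, epochs=50)

    def cosine(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    # Phenotype = seed concept plus its nearest neighbors in embedding space.
    seed = "ICD10:E11.9"
    neighbors = [c for c, _ in model.wv.most_similar(seed, topn=2)]
    phenotype = np.mean([model.wv[c] for c in [seed, *neighbors]], axis=0)

    # Link a patient to the phenotype if their averaged history embedding is close.
    for pid, codes in histories.items():
        patient_vec = np.mean([model.wv[c] for c in codes], axis=0)
        print(pid, round(cosine(patient_vec, phenotype), 3))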