{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Collect relevant structured data for all labeled patients, for the purpose of creating classifiers separate from the NLP project, to identify patient cohorts to preferrentially sample for manual concept analysis." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Code setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import all of the standard library tools" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from datetime import datetime\n", "import configparser\n", "import hashlib\n", "from importlib import reload\n", "import logging\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "import pathlib as pl\n", "import sys\n", "import yaml\n", "\n", "from IPython import display" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Imports for sqlalchemy" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sqlalchemy as sa\n", "from sqlalchemy.engine import reflection\n", "from sqlalchemy.ext.declarative import declarative_base\n", "from sqlalchemy import create_engine, MetaData, inspect\n", "from sqlalchemy.orm import sessionmaker\n", "from sqlalchemy.ext.automap import automap_base, name_for_scalar_relationship, generate_relationship" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import the ICD9 library" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sys.path.append('icd9')\n", "from icd9 import ICD9\n", "\n", "tree = ICD9('icd9/codes.json')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Import our utility code" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import etc_utils as eu\n", "import mimic_extraction_utils as meu\n", "import structured_data_utils as sdu" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reload block that can be run as utility code is modified" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reload(eu)\n", "reload(meu)\n", "reload(sdu)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure pandas and matplot lib for nice web printing" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "pd.options.display.max_rows = 1000\n", "pd.options.display.max_columns = 50\n", "pd.options.display.max_colwidth = 100" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load config files, configure logging" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "work_desc = \"gather_structured_data\"" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time string: 2017-01-02-14-57\n", "Paths:\n", "log_dir: /Users/ecarlson/logs/mit_frequent_fliers\n", "input_dir: /Users/ecarlson/data/mit_frequent_fliers\n", "results_dir: /Users/ecarlson/results/mit_frequent_fliers\n", "repo_data_dir: /Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data\n" ] } ], "source": [ "time_str, path_config, creds = eu.load_config_v2(creds_file='../../private/mit_freq_fliers/credentials.yaml')\n", "print('Time string: {}'.format(time_str))\n", "\n", "print('Paths:')\n", "for k, item in path_config.items():\n", " print('{}: {}'.format(k, item))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Logging to /Users/ecarlson/logs/mit_frequent_fliers/2017-01-02-14-57_gather_structured_data.log\n" ] } ], "source": [ "logger = logging.getLogger()\n", "\n", "eu.configure_logging(logger, work_desc=work_desc, log_directory=path_config['log_dir'], time_str=time_str)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['mimic3_v1_4', 'mimic2_v2_6', 'postgres']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[k for k in creds.keys()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Connect to database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we'll use the credentials loaded from our config file to connect to the MIMIC postgres database" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [ "db_path = '{sa_driver}://{username}:{password}@{hostname}/{dbname}'.format(**creds['mimic3_v1_4'])\n", "engine = create_engine(db_path)\n", "sm = sessionmaker(bind=engine)\n", "s = sm()\n", "conn = s.connection()\n", "\n", "meta = MetaData(schema=\"mimiciii\", bind=engine)\n", "meta.reflect(bind=engine)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "SQLAlchemy has a handy database reflection capability we can use to create access points to all of the MIMIC tables" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "base = automap_base(metadata=meta)\n", "base.prepare(engine, reflect=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can quickly check out the schema it discovered and test it" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "admissions\n", "callout\n", "caregivers\n", "chartevents\n", "cptevents\n", "d_cpt\n", "d_icd_diagnoses\n", "d_icd_procedures\n", "d_items\n", "d_labitems\n", "datetimeevents\n", "diagnoses_icd\n", "drgcodes\n", "icustays\n", "inputevents_cv\n", "inputevents_mv\n", "labevents\n", "microbiologyevents\n", "noteevents\n", "outputevents\n", "patients\n", "prescriptions\n", "procedureevents_mv\n", "procedures_icd\n", "services\n", "transfers\n" ] } ], "source": [ "for cls_name in sorted(base.classes.keys()):\n", " print(cls_name)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "note_tb = base.classes['noteevents']" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2083180" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s.query(note_tb.category).count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load labeled notes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the annotation files to get the patient IDs we need to extract ICD9 data for. Also, use our patient matching code from previous notebook to find MIMIC III equivalent IDs for the patients." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "categories = ['Advanced.Cancer', 'Advanced.Heart.Disease', 'Advanced.Lung.Disease',\n", " 'Alcohol.Abuse',\n", " 'Chronic.Neurological.Dystrophies', 'Chronic.Pain.Fibromyalgia',\n", " 'Dementia', 'Depression', 'Developmental.Delay.Retardation',\n", " 'Non.Adherence', 'None',\n", " 'Obesity', 'Other.Substance.Abuse', \n", " 'Schizophrenia.and.other.Psychiatric.Disorders', 'Unsure',]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data_path = pl.Path(path_config['repo_data_dir'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/AllnursingFinal27Sep16.csv'),\n", " PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/combined_label_data_2016-10-24-16-35.csv'),\n", " PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/dischargeSummariesClean.csv'),\n", " PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/mimic3_note_equivs_2016-10-22-03-39.csv'),\n", " PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/mimic3_note_metadata_2016-10-24-16-35.csv'),\n", " PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/notes_icd9_codes_2016-10-24-16-35.csv'),\n", " PosixPath('/Users/ecarlson/code/etc/mit_frequent_fliers/mit-team-code/data/nursingNotesClean.csv')]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[p for p in data_path.glob('*csv')]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "nursing_notes_path = data_path.joinpath('nursingNotesClean.csv')\n", "discharge_notes_path = data_path.joinpath('dischargeSummariesClean.csv')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "nursing_notes = pd.read_csv(nursing_notes_path.as_posix())\n", "disch_notes = pd.read_csv(discharge_notes_path.as_posix()).rename(columns={'subject.id':'subject_id'})" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Hospital.Admission.IDsubject_idicustay_idcharttimerealtimecategorytitletextcohortrandSubject.IDICU.IDNote.TypeChart.timeCategoryReal.timeNoneObesityNon.AdherenceDevelopmental.Delay.RetardationAdvanced.Heart.DiseaseAdvanced.Lung.DiseaseSchizophrenia.and.other.Psychiatric.DisordersAlcohol.AbuseOther.Substance.AbuseChronic.Pain.FibromyalgiaChronic.Neurological.DystrophiesAdvanced.CancerDepressionDementiaUnsureoperator
022480730800.03033-07-09 00:56:003033-07-09 01:22:00Nursing/OtherMICU/SICU NURSING PROGRESS NOTE.\\nMICU/SICU NURSING PROGRESS NOTE.\\n SEE CAREVIEW FOR OBJECTIVE DATA.\\n Neuro: Arouses w...10.3308792480730800.0Nursing/Other3033-07-09 00:56:00Nursing/Other3033-07-09 01:22:00100000000000000ETM
\n", "
" ], "text/plain": [ " Hospital.Admission.ID subject_id icustay_id charttime \\\n", "0 2 24807 30800.0 3033-07-09 00:56:00 \n", "\n", " realtime category title \\\n", "0 3033-07-09 01:22:00 Nursing/Other MICU/SICU NURSING PROGRESS NOTE. \n", "\n", " text \\\n", "0 \\nMICU/SICU NURSING PROGRESS NOTE.\\n SEE CAREVIEW FOR OBJECTIVE DATA.\\n Neuro: Arouses w... \n", "\n", " cohort rand Subject.ID ICU.ID Note.Type Chart.time \\\n", "0 1 0.330879 24807 30800.0 Nursing/Other 3033-07-09 00:56:00 \n", "\n", " Category Real.time None Obesity Non.Adherence \\\n", "0 Nursing/Other 3033-07-09 01:22:00 1 0 0 \n", "\n", " Developmental.Delay.Retardation Advanced.Heart.Disease \\\n", "0 0 0 \n", "\n", " Advanced.Lung.Disease Schizophrenia.and.other.Psychiatric.Disorders \\\n", "0 0 0 \n", "\n", " Alcohol.Abuse Other.Substance.Abuse Chronic.Pain.Fibromyalgia \\\n", "0 0 0 0 \n", "\n", " Chronic.Neurological.Dystrophies Advanced.Cancer Depression Dementia \\\n", "0 0 0 0 0 \n", "\n", " Unsure operator \n", "0 0 ETM " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "MICU/SICU NURSING PROGRESS NOTE.\n", " SEE CAREVIEW FOR OBJECTIVE DATA.\n", " Neuro: Arouses with verbal stimuli, ms is variable and changes from a&o x 3 to a&o x 1 with moaning episodes, repetitive speach and repetative motions with rue. Pupils are rt 4mm, lt 3mm and sluggish bilat. Head ct completed and was negative. Was given versed 1 mg during ct and pt became much more calm and was able to answer questions appropriatly. Reports able to feel in all 4 extrem., Partial movement of rt ue and lt ue.\n", " Respiratory: Lung sounds are coarse throughout, diminished in lt base. RR 12-24 and non-labored except when having moaning episodes. O2 saturation is 94-100% on 4l nc. Cxr in ed showed persistent lll pneumonia and atelectasis in rt upper field. Expectorating thick tan sputum in abundant amounts.\n", " CV: Sinus rythm, rate 70-98 with no ectopy noted. Nbp 104- 134 systolic. Good pulses all 4 extrem. \n", " GI/GU: Abdomen si softly distended with + bs. Pt is able to take regular meals, is full feed, but must be bolt upright. Able to take meds with custard or jello. No bm this shift but reports bm yesterday. Foley catheter is 30fr placed at os facility, occn leaks around catheter. Patetn and draing clear amber urine. ? uti, leuks and blood in ua.\n", " Integument: Multiple skin issues. Stage 3 decubiti below lt glutael, rash on lt and rty gluteals extending down to just above knee joint, raised and pink area in rt lower abdomen, blister type area on rt upper thigh. Lidocaine patches on mid thorax and lumbar spine. \n", " Plan: Monitor respiratory status and tx pnuemonia with abx, Tx uti, Assess present pain control methods and find alteratives. Obtain first step bed if to be pt here for any lenght of time.\n", "\n" ] } ], "source": [ "display.display(nursing_notes.head(1))\n", "print(nursing_notes.loc[0,'text'])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Hospital.Admission.IDsubject_idchartdatecategorydescriptionstextcohortSubject.IDICU.IDNote.TypeChart.timeCategoryReal.timeNoneObesityNon.AdherenceDevelopmental.Delay.RetardationAdvanced.Heart.DiseaseAdvanced.Lung.DiseaseSchizophrenia.and.other.Psychiatric.DisordersAlcohol.AbuseOther.Substance.AbuseChronic.Pain.FibromyalgiaChronic.Neurological.DystrophiesAdvanced.CancerDepressionDementiaUnsureoperator
0100020997310-DEC-2142 00:00:00DischargeSummary\\n\\n\\n\\nAdmission Date: [**2512-1-8**] Discharge Date: [**2512-1-18**]\\n\\nDate o...19973100020Discharge100020Discharge10-DEC-2142 00:00:00000000000110000JTW
\n", "
" ], "text/plain": [ " Hospital.Admission.ID subject_id chartdate category \\\n", "0 100020 9973 10-DEC-2142 00:00:00 Discharge \n", "\n", " descriptions \\\n", "0 Summary \n", "\n", " text \\\n", "0 \\n\\n\\n\\nAdmission Date: [**2512-1-8**] Discharge Date: [**2512-1-18**]\\n\\nDate o... \n", "\n", " cohort Subject.ID ICU.ID Note.Type Chart.time Category \\\n", "0 1 9973 100020 Discharge 100020 Discharge \n", "\n", " Real.time None Obesity Non.Adherence \\\n", "0 10-DEC-2142 00:00:00 0 0 0 \n", "\n", " Developmental.Delay.Retardation Advanced.Heart.Disease \\\n", "0 0 0 \n", "\n", " Advanced.Lung.Disease Schizophrenia.and.other.Psychiatric.Disorders \\\n", "0 0 0 \n", "\n", " Alcohol.Abuse Other.Substance.Abuse Chronic.Pain.Fibromyalgia \\\n", "0 0 0 1 \n", "\n", " Chronic.Neurological.Dystrophies Advanced.Cancer Depression Dementia \\\n", "0 1 0 0 0 \n", "\n", " Unsure operator \n", "0 0 JTW " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display.display(disch_notes.head(1))\n", "#print(disch_notes.loc[0,'text'])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "mimic3_map = pd.read_csv(data_path.joinpath('mimic3_note_equivs_2016-10-22-03-39.csv').as_posix())" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idhadm_idicustay_idcategorychartdatecharttimerealtimelengthmd5str_startstr_endrow_id_m3chartdate_m3charttime_m3storetime_m3md5_m3str_start_m3str_end_m3len_diff_pctstr_start_diffstr_end_difftotal_m3_distance
068.017608.078.0Nursing/otherNaN2728-12-13 16:51:002728-12-13 17:08:001666.027572b36bd4c26c322f50cf65d095d16NSG0700-1900Events:Attemptedeatingbreakfastbutbecaine..?doubleorsinglelumen.Continuetomonitoroutput.1261174.02173-12-162173-12-16 16:51:002173-12-16 17:08:00a71a1009932ab1fce83b8867234ec9e4NSG0700-1900Events:Attemptedeatingbreakfastbutbecaine..?doubleorsinglelumen.Continuetomonitoroutput.0.0012000.000.00.001200
1109.0164029.0NaNDischarge summary2142-12-10NaN2142-12-10 00:00:007751.0f5f69772c32f1b0ac05b7cf408f7a6db:::Sex:FService:MEDICINEAllergies:PenicillinsAttenrativeassistant,,atifyouneedtoconfirmorreschedule.15322.02140-01-21NaNNaN1770e41dbbe1a89a144bcebfad1fc87e:::Sex:FService:MEDICINEAllergies:PenicillinsAttenrativeassistant,,atifyouneedtoconfirmorreschedule.0.0025800.000.00.002580
2109.026693.0132.0Nursing/otherNaN3353-03-09 02:13:003353-03-09 02:47:002107.027d1f5907fa14b6702837a845f84c54eficunsgnote:21:30-7:00thisisa22y.o.womanadmwithlupneedtocontinuehemodialysisscheduleqtues/thurs/sat.1261605.02140-04-082140-04-08 03:13:002140-04-08 03:47:006e88c96f2f3f7dab74d3732eec27dd35nsgnote:21:30-7:00thisisa22y.o.womanadmwithlupusdxneedtocontinuehemodialysisscheduleqtues/thurs/sat.0.0317990.080.00.111799
3109.012757.0130.0Nursing/otherNaN3352-12-19 16:48:003352-12-19 17:05:001294.03e0fff775cfb678fdfa06ece68ebfab5NPN7A-7PSeecarevueforspecifics:Briefly22yroldfemalwnwhendialysiscompleted,LabetalolgttforSBP>180prn.1261597.02140-01-192140-01-19 16:48:002140-01-19 17:05:00d3350e7e8b6a9073feae2c0f11ea82e3NPN7A-7PSeecarevueforspecifics:Briefly22yroldfemalwnwhendialysiscompleted,LabetalolgttforSBP>180prn.0.0007730.000.00.000773
4109.014234.0129.0Nursing/otherNaN3351-03-18 02:36:003351-03-18 03:24:003083.08efc0a2ff698b75ce183e3183c1bf204MICUA7P-7ASHIFTSUMMARYOFEVENTS:PTADMITTEDAT1930INHbyremovingclonodinepatches.tore-evalanti-hypertens1261589.02138-04-172138-04-17 03:36:002138-04-17 04:24:00b26423d357dc3009f6e063c405ce742eMICUA7P-7ASHIFTSUMMARYOFEVENTS:PTADMITTEDATINHYPERbyremovingclonodinepatches.tore-evalanti-hypertens0.0139470.080.00.093947
\n", "
" ], "text/plain": [ " subject_id hadm_id icustay_id category chartdate \\\n", "0 68.0 17608.0 78.0 Nursing/other NaN \n", "1 109.0 164029.0 NaN Discharge summary 2142-12-10 \n", "2 109.0 26693.0 132.0 Nursing/other NaN \n", "3 109.0 12757.0 130.0 Nursing/other NaN \n", "4 109.0 14234.0 129.0 Nursing/other NaN \n", "\n", " charttime realtime length \\\n", "0 2728-12-13 16:51:00 2728-12-13 17:08:00 1666.0 \n", "1 NaN 2142-12-10 00:00:00 7751.0 \n", "2 3353-03-09 02:13:00 3353-03-09 02:47:00 2107.0 \n", "3 3352-12-19 16:48:00 3352-12-19 17:05:00 1294.0 \n", "4 3351-03-18 02:36:00 3351-03-18 03:24:00 3083.0 \n", "\n", " md5 \\\n", "0 27572b36bd4c26c322f50cf65d095d16 \n", "1 f5f69772c32f1b0ac05b7cf408f7a6db \n", "2 27d1f5907fa14b6702837a845f84c54e \n", "3 3e0fff775cfb678fdfa06ece68ebfab5 \n", "4 8efc0a2ff698b75ce183e3183c1bf204 \n", "\n", " str_start \\\n", "0 NSG0700-1900Events:Attemptedeatingbreakfastbutbeca \n", "1 :::Sex:FService:MEDICINEAllergies:PenicillinsAtten \n", "2 ficunsgnote:21:30-7:00thisisa22y.o.womanadmwithlup \n", "3 NPN7A-7PSeecarevueforspecifics:Briefly22yroldfemal \n", "4 MICUA7P-7ASHIFTSUMMARYOFEVENTS:PTADMITTEDAT1930INH \n", "\n", " str_end row_id_m3 chartdate_m3 \\\n", "0 ine..?doubleorsinglelumen.Continuetomonitoroutput. 1261174.0 2173-12-16 \n", "1 rativeassistant,,atifyouneedtoconfirmorreschedule. 15322.0 2140-01-21 \n", "2 needtocontinuehemodialysisscheduleqtues/thurs/sat. 1261605.0 2140-04-08 \n", "3 wnwhendialysiscompleted,LabetalolgttforSBP>180prn. 1261597.0 2140-01-19 \n", "4 byremovingclonodinepatches.tore-evalanti-hypertens 1261589.0 2138-04-17 \n", "\n", " charttime_m3 storetime_m3 md5_m3 \\\n", "0 2173-12-16 16:51:00 2173-12-16 17:08:00 a71a1009932ab1fce83b8867234ec9e4 \n", "1 NaN NaN 1770e41dbbe1a89a144bcebfad1fc87e \n", "2 2140-04-08 03:13:00 2140-04-08 03:47:00 6e88c96f2f3f7dab74d3732eec27dd35 \n", "3 2140-01-19 16:48:00 2140-01-19 17:05:00 d3350e7e8b6a9073feae2c0f11ea82e3 \n", "4 2138-04-17 03:36:00 2138-04-17 04:24:00 b26423d357dc3009f6e063c405ce742e \n", "\n", " str_start_m3 \\\n", "0 NSG0700-1900Events:Attemptedeatingbreakfastbutbeca \n", "1 :::Sex:FService:MEDICINEAllergies:PenicillinsAtten \n", "2 nsgnote:21:30-7:00thisisa22y.o.womanadmwithlupusdx \n", "3 NPN7A-7PSeecarevueforspecifics:Briefly22yroldfemal \n", "4 MICUA7P-7ASHIFTSUMMARYOFEVENTS:PTADMITTEDATINHYPER \n", "\n", " str_end_m3 len_diff_pct \\\n", "0 ine..?doubleorsinglelumen.Continuetomonitoroutput. 0.001200 \n", "1 rativeassistant,,atifyouneedtoconfirmorreschedule. 0.002580 \n", "2 needtocontinuehemodialysisscheduleqtues/thurs/sat. 0.031799 \n", "3 wnwhendialysiscompleted,LabetalolgttforSBP>180prn. 0.000773 \n", "4 byremovingclonodinepatches.tore-evalanti-hypertens 0.013947 \n", "\n", " str_start_diff str_end_diff total_m3_distance \n", "0 0.00 0.0 0.001200 \n", "1 0.00 0.0 0.002580 \n", "2 0.08 0.0 0.111799 \n", "3 0.00 0.0 0.000773 \n", "4 0.08 0.0 0.093947 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mimic3_map.head(5)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "disch_notes['md5'] = disch_notes['text'].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())\n", "nursing_notes['md5'] = nursing_notes['text'].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cols_to_keep = ['subject_id', 'category', 'md5', 'operator'] + categories\n", "comb_dat = pd.concat([disch_notes[cols_to_keep], nursing_notes[cols_to_keep]])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcategorymd5operatorAdvanced.CancerAdvanced.Heart.DiseaseAdvanced.Lung.DiseaseAlcohol.AbuseChronic.Neurological.DystrophiesChronic.Pain.FibromyalgiaDementiaDepressionDevelopmental.Delay.RetardationNon.AdherenceNoneObesityOther.Substance.AbuseSchizophrenia.and.other.Psychiatric.DisordersUnsure
09973Discharge56f2598342cce321539d8975809d487cJTW000011000000000
19973Discharge56f2598342cce321539d8975809d487cETM000011000000000
23365Dischargeeaea5c3c7577135a83f1f0fb583e0d53JTW000000000010000
33365Dischargeeaea5c3c7577135a83f1f0fb583e0d53ETM000000000010000
427290Dischargee7433c0b75ea00346390f029bb830774JW000000000010000
\n", "
" ], "text/plain": [ " subject_id category md5 operator \\\n", "0 9973 Discharge 56f2598342cce321539d8975809d487c JTW \n", "1 9973 Discharge 56f2598342cce321539d8975809d487c ETM \n", "2 3365 Discharge eaea5c3c7577135a83f1f0fb583e0d53 JTW \n", "3 3365 Discharge eaea5c3c7577135a83f1f0fb583e0d53 ETM \n", "4 27290 Discharge e7433c0b75ea00346390f029bb830774 JW \n", "\n", " Advanced.Cancer Advanced.Heart.Disease Advanced.Lung.Disease \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Alcohol.Abuse Chronic.Neurological.Dystrophies Chronic.Pain.Fibromyalgia \\\n", "0 0 1 1 \n", "1 0 1 1 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Dementia Depression Developmental.Delay.Retardation Non.Adherence None \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 1 \n", "3 0 0 0 0 1 \n", "4 0 0 0 0 1 \n", "\n", " Obesity Other.Substance.Abuse \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " Schizophrenia.and.other.Psychiatric.Disorders Unsure \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "comb_dat.head()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcategorymd5operatorAdvanced.CancerAdvanced.Heart.DiseaseAdvanced.Lung.DiseaseAlcohol.AbuseChronic.Neurological.DystrophiesChronic.Pain.FibromyalgiaDementiaDepressionDevelopmental.Delay.RetardationNon.AdherenceNoneObesityOther.Substance.AbuseSchizophrenia.and.other.Psychiatric.DisordersUnsure
199428068Nursing/Otherde1d6912eb489b0ab1cb35137c69e995ETM000000000010000
199528068Nursing/Otherde1d6912eb489b0ab1cb35137c69e995JTW000000000010000
199631369Nursing/Other2302d0a9ce57a278e84dbbb04f599089JF000000000010000
199731369Nursing/Other2302d0a9ce57a278e84dbbb04f599089JW100000000000001
199831369Nursing/Other2302d0a9ce57a278e84dbbb04f599089JF000000000010000
\n", "
" ], "text/plain": [ " subject_id category md5 operator \\\n", "1994 28068 Nursing/Other de1d6912eb489b0ab1cb35137c69e995 ETM \n", "1995 28068 Nursing/Other de1d6912eb489b0ab1cb35137c69e995 JTW \n", "1996 31369 Nursing/Other 2302d0a9ce57a278e84dbbb04f599089 JF \n", "1997 31369 Nursing/Other 2302d0a9ce57a278e84dbbb04f599089 JW \n", "1998 31369 Nursing/Other 2302d0a9ce57a278e84dbbb04f599089 JF \n", "\n", " Advanced.Cancer Advanced.Heart.Disease Advanced.Lung.Disease \\\n", "1994 0 0 0 \n", "1995 0 0 0 \n", "1996 0 0 0 \n", "1997 1 0 0 \n", "1998 0 0 0 \n", "\n", " Alcohol.Abuse Chronic.Neurological.Dystrophies \\\n", "1994 0 0 \n", "1995 0 0 \n", "1996 0 0 \n", "1997 0 0 \n", "1998 0 0 \n", "\n", " Chronic.Pain.Fibromyalgia Dementia Depression \\\n", "1994 0 0 0 \n", "1995 0 0 0 \n", "1996 0 0 0 \n", "1997 0 0 0 \n", "1998 0 0 0 \n", "\n", " Developmental.Delay.Retardation Non.Adherence None Obesity \\\n", "1994 0 0 1 0 \n", "1995 0 0 1 0 \n", "1996 0 0 1 0 \n", "1997 0 0 0 0 \n", "1998 0 0 1 0 \n", "\n", " Other.Substance.Abuse Schizophrenia.and.other.Psychiatric.Disorders \\\n", "1994 0 0 \n", "1995 0 0 \n", "1996 0 0 \n", "1997 0 0 \n", "1998 0 0 \n", "\n", " Unsure \n", "1994 0 \n", "1995 0 \n", "1996 0 \n", "1997 1 \n", "1998 0 " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "comb_dat.tail()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "comb_dat = pd.merge(comb_dat, mimic3_map[['md5', 'row_id_m3', 'total_m3_distance']], on='md5', how='left')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcategorymd5operatorAdvanced.CancerAdvanced.Heart.DiseaseAdvanced.Lung.DiseaseAlcohol.AbuseChronic.Neurological.DystrophiesChronic.Pain.FibromyalgiaDementiaDepressionDevelopmental.Delay.RetardationNon.AdherenceNoneObesityOther.Substance.AbuseSchizophrenia.and.other.Psychiatric.DisordersUnsurerow_id_m3total_m3_distance
09973Discharge56f2598342cce321539d8975809d487cJTW000011000000000NaNNaN
19973Discharge56f2598342cce321539d8975809d487cETM000011000000000NaNNaN
23365Dischargeeaea5c3c7577135a83f1f0fb583e0d53JTW00000000001000017170.00.000884
33365Dischargeeaea5c3c7577135a83f1f0fb583e0d53ETM00000000001000017170.00.000884
427290Dischargee7433c0b75ea00346390f029bb830774JW00000000001000050828.00.001554
\n", "
" ], "text/plain": [ " subject_id category md5 operator \\\n", "0 9973 Discharge 56f2598342cce321539d8975809d487c JTW \n", "1 9973 Discharge 56f2598342cce321539d8975809d487c ETM \n", "2 3365 Discharge eaea5c3c7577135a83f1f0fb583e0d53 JTW \n", "3 3365 Discharge eaea5c3c7577135a83f1f0fb583e0d53 ETM \n", "4 27290 Discharge e7433c0b75ea00346390f029bb830774 JW \n", "\n", " Advanced.Cancer Advanced.Heart.Disease Advanced.Lung.Disease \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Alcohol.Abuse Chronic.Neurological.Dystrophies Chronic.Pain.Fibromyalgia \\\n", "0 0 1 1 \n", "1 0 1 1 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Dementia Depression Developmental.Delay.Retardation Non.Adherence None \\\n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 1 \n", "3 0 0 0 0 1 \n", "4 0 0 0 0 1 \n", "\n", " Obesity Other.Substance.Abuse \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " Schizophrenia.and.other.Psychiatric.Disorders Unsure row_id_m3 \\\n", "0 0 0 NaN \n", "1 0 0 NaN \n", "2 0 0 17170.0 \n", "3 0 0 17170.0 \n", "4 0 0 50828.0 \n", "\n", " total_m3_distance \n", "0 NaN \n", "1 NaN \n", "2 0.000884 \n", "3 0.000884 \n", "4 0.001554 " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "comb_dat.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.6% of notes had no exact match in MIMIC 3\n" ] } ], "source": [ "grouped = comb_dat.groupby('md5')['total_m3_distance'].last()\n", "pct_unmatched = grouped.isnull().sum()/grouped.count() * 100\n", "print('{:3.2}% of notes had no exact match in MIMIC 3'.format(pct_unmatched))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-10-24 16:41:49,493 - root - INFO - ../../data/combined_label_data_2016-10-24-16-35.csv\n" ] } ], "source": [ "output_path = pl.Path(path_config['repo_data_dir']).joinpath('combined_label_data_{}.csv'.format(time_str))\n", "logger.info(output_path)\n", "comb_dat.to_csv(output_path.as_posix(), index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Gather patient's data from the database and export" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test functions for gathering data" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [], "source": [ "note_meta = sdu.get_note_metadata(conn, comb_dat.loc[2, 'row_id_m3'])" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "cgid:\tNone\n", "charttime:\tNone\n", "storetime:\tNone\n", "chartdate:\t2179-12-23 00:00:00\n", "subject_id:\t3365\n", "hadm_id:\t100103\n" ] } ], "source": [ "for (k, v) in note_meta.items():\n", " print(\"{}:\\t{}\".format(k, v))" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['cgid', 'charttime', 'storetime', 'chartdate', 'subject_id', 'hadm_id'])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "note_meta.keys()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [], "source": [ "diags = sdu.get_hadm_diagnoses(conn, note_meta['hadm_id'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'clean_icd9_code': '396.3',\n", " 'hadm_id': 100103,\n", " 'icd9_code': '3963',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Mitral valve insufficiency and aortic valve insufficiency',\n", " 'seq_num': 1,\n", " 'short_title': 'Mitral/aortic val insuff',\n", " 'subject_id': 3365},\n", " {'clean_icd9_code': '398.91',\n", " 'hadm_id': 100103,\n", " 'icd9_code': '39891',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Rheumatic heart failure (congestive)',\n", " 'seq_num': 2,\n", " 'short_title': 'Rheumatic heart failure',\n", " 'subject_id': 3365},\n", " {'clean_icd9_code': '427.31',\n", " 'hadm_id': 100103,\n", " 'icd9_code': '42731',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Atrial fibrillation',\n", " 'seq_num': 3,\n", " 'short_title': 'Atrial fibrillation',\n", " 'subject_id': 3365}]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diags[:3]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['396.3',\n", " '398.91',\n", " '427.31',\n", " '414.01',\n", " '401.9',\n", " '272.4',\n", " '530.81',\n", " 'V10.11',\n", " 'V15.2']" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[d['clean_icd9_code'] for d in diags if d['known_icd9_code']]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parents:\n", "- ROOT: ROOT\n", "- 390-459: DISEASES OF THE CIRCULATORY SYSTEM \n", "- 393-398: CHRONIC RHEUMATIC HEART DISEASE \n", "- 396: Diseases of mitral and aortic valves\n", "- 396.3: Mitral valve insufficiency and aortic valve insufficiency\n", "\n", "-> 396.3: Mitral valve insufficiency and aortic valve insufficiency\n", "\n", "Children:\n" ] } ], "source": [ "sdu.print_icd9_tree(diags[0]['clean_icd9_code'])" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parents:\n", "- ROOT: ROOT\n", "- 240-279: ENDOCRINE, NUTRITIONAL AND METABOLIC DISEASES, AND IMMUNITY DISORDERS \n", "- 270-279: OTHER METABOLIC AND IMMUNITY DISORDERS \n", "- 272: Disorders of lipoid metabolism\n", "- 272.4: Other and unspecified hyperlipidemia\n", "\n", "-> 272.4: Other and unspecified hyperlipidemia\n", "\n", "Children:\n" ] } ], "source": [ "sdu.print_icd9_tree(diags[5]['clean_icd9_code'])" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parents:\n", "- ROOT: ROOT\n", "- V10-V19: PERSONS WITH POTENTIAL HEALTH HAZARDS RELATED TO PERSONAL AND FAMILY HISTORY \n", "- V10: Personal history of malignant neoplasm\n", "- V10.1: Trachea, bronchus, and lung\n", "- V10.11: Bronchus and lung\n", "\n", "-> V10.11: Bronchus and lung\n", "\n", "Children:\n" ] } ], "source": [ "sdu.print_icd9_tree(diags[7]['clean_icd9_code'])" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parents:\n", "- ROOT: ROOT\n", "- V10-V19: PERSONS WITH POTENTIAL HEALTH HAZARDS RELATED TO PERSONAL AND FAMILY HISTORY \n", "- V15: Other personal history presenting hazards to health\n", "- V15.2: Surgery to other organs\n", "\n", "-> V15.2: Surgery to other organs\n", "\n", "Children:\n", "- V15.29: Surgery to other organs\n", "- V15.22: Personal history of undergoing in utero procedure while a fetus\n", "- V15.21: Personal history of undergoing in utero procedure during pregnancy\n" ] } ], "source": [ "sdu.print_icd9_tree(diags[8]['clean_icd9_code'])" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reload(sdu)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parents:\n", "- ROOT: ROOT\n", "- V10-V19: PERSONS WITH POTENTIAL HEALTH HAZARDS RELATED TO PERSONAL AND FAMILY HISTORY \n", "- V15: Other personal history presenting hazards to health\n", "- V15.2: Surgery to other organs\n", "\n", "-> V15.2: Surgery to other organs\n", "\n", "Children:\n", "- V15.29: Surgery to other organs\n", "- V15.22: Personal history of undergoing in utero procedure while a fetus\n", "- V15.21: Personal history of undergoing in utero procedure during pregnancy\n" ] } ], "source": [ "sdu.print_icd9_tree(diags[8]['clean_icd9_code'])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['V10-V19', 'V15', 'V15.2']" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sdu.get_icd9_levels(diags[8]['clean_icd9_code'], 5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Gather the data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Reload any changes made to the structured data utils code" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reload(sdu)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use pandas to extract the list of unique notes and patients - the primary thing we're looking for is the MIMIC III row id, which is used to get the MIMIC encounter ID, and from there the ICD9 diagnoses." ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "found_notes = comb_dat.loc[comb_dat['row_id_m3'].notnull()].\\\n", " groupby(['subject_id', 'md5', 'row_id_m3']).count()['total_m3_distance'].index.tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Iterate through the rows, building up a dictionary of dictionaries. `note_info` is a dictionary where the keys are the unique subject_id-md5-row_id triplet from the pandas line above. The values are another dictionary with 2 keys: \n", "\n", " * `meta` - note metadata, including the patient id (`subject_id`), encounter id (`hadm_id`), and associated timestamps\n", " * `diagnoses` - a list of the diagnoses associated with this encounter, including the original poorly formated ICD9 code from MIMIC, the reformated version (`clean_icd9_code`), and the label of the code" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [], "source": [ "note_info = {}\n", "for idx in found_notes:\n", " note_meta = sdu.get_note_metadata(conn, idx[2])\n", " note_diag = sdu.get_hadm_diagnoses(conn, note_meta['hadm_id'])\n", " dat = {'meta': note_meta, 'diagnoses': note_diag}\n", " note_info[idx] = dat" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Print one element out to see how it looks" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'diagnoses': [{'clean_icd9_code': '410.71',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '41071',\n", " 'known_icd9_code': False,\n", " 'long_title': 'Subendocardial infarction, initial episode of care',\n", " 'seq_num': 1,\n", " 'short_title': 'Subendo infarct, initial',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '398.91',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '39891',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Rheumatic heart failure (congestive)',\n", " 'seq_num': 2,\n", " 'short_title': 'Rheumatic heart failure',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '396.3',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '3963',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Mitral valve insufficiency and aortic valve insufficiency',\n", " 'seq_num': 3,\n", " 'short_title': 'Mitral/aortic val insuff',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '397.0',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '3970',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Diseases of tricuspid valve',\n", " 'seq_num': 4,\n", " 'short_title': 'Tricuspid valve disease',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '042',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '042',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Human immunodeficiency virus [HIV] disease',\n", " 'seq_num': 5,\n", " 'short_title': 'Human immuno virus dis',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '403.91',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '40391',\n", " 'known_icd9_code': False,\n", " 'long_title': 'Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage V or end stage renal disease',\n", " 'seq_num': 6,\n", " 'short_title': 'Hyp kid NOS w cr kid V',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '518.81',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '51881',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Acute respiratory failure',\n", " 'seq_num': 7,\n", " 'short_title': 'Acute respiratry failure',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '414.01',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '41401',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Coronary atherosclerosis of native coronary artery',\n", " 'seq_num': 8,\n", " 'short_title': 'Crnry athrscl natve vssl',\n", " 'subject_id': 11590},\n", " {'clean_icd9_code': '272.0',\n", " 'hadm_id': 172993,\n", " 'icd9_code': '2720',\n", " 'known_icd9_code': True,\n", " 'long_title': 'Pure hypercholesterolemia',\n", " 'seq_num': 9,\n", " 'short_title': 'Pure hypercholesterolem',\n", " 'subject_id': 11590}],\n", " 'meta': {'cgid': 17770,\n", " 'chartdate': datetime.datetime(2154, 6, 3, 0, 0),\n", " 'charttime': datetime.datetime(2154, 6, 3, 17, 30),\n", " 'hadm_id': 172993,\n", " 'storetime': datetime.datetime(2154, 6, 3, 17, 51),\n", " 'subject_id': 11590}}" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "note_info[[k for k in note_info.keys()][0]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can use this list and the ICD9 python library to extract all of the parents for each code. Not all codes in MIMIC III are know to the library (likely due to slightly different ICD versions), so we need to handle that possibility by just skipping unknown codes. If it's a know code then we look up the parents, and we'll add the code and each of its parents to the `note_codes` list. We'll also keep a list of the metadata." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "note_codes = []\n", "note_meta = []\n", "unknown_codes = set()\n", "for k, note_dat in note_info.items():\n", " subject_id, md5, row_id = k\n", "\n", " meta = note_dat['meta'].copy()\n", " meta['subject_id'] = subject_id\n", " meta['md5'] = md5\n", " meta['note_row_id'] = row_id\n", " note_meta.append(meta)\n", "\n", " diagnoses = note_dat['diagnoses']\n", " if diagnoses is not None:\n", " for diag in diagnoses:\n", " new_code = {\n", " 'subject_id': subject_id,\n", " 'md5': md5,\n", " 'note_row_id': row_id,\n", " 'level': 'source',\n", " 'code': diag['icd9_code']\n", " }\n", " note_codes.append(new_code)\n", "\n", " if diag['known_icd9_code']:\n", " levels = sdu.get_icd9_levels(diag['clean_icd9_code'])\n", " for ind, lev_code in enumerate(levels):\n", " new_code = {\n", " 'subject_id': subject_id,\n", " 'md5': md5,\n", " 'note_row_id': row_id,\n", " 'level': ind,\n", " 'code': lev_code\n", " }\n", " note_codes.append(new_code)\n", "\n", " else:\n", " if diag['icd9_code'] not in unknown_codes:\n", " unknown_codes.add(diag['icd9_code'])\n", " logger.info('Unknown code ({}) for subject ({})'.format(diag['icd9_code'], subject_id))" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "375" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(unknown_codes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inspecting the records, we see that for a particular note (row id 1414073), the code found a known ICD9 code (39891), then found a root parent (390-459), and the path from it through children 393-398, 398, ... We keep track of the hierarchy level from the root node - in a future post we'll use this info to select a cutoff depth for classification based on ICD9" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
codelevelmd5note_row_idsubject_id
041071sourcebe74552c73a0f9895c4f372763054d261414073.011590
139891sourcebe74552c73a0f9895c4f372763054d261414073.011590
2390-4590be74552c73a0f9895c4f372763054d261414073.011590
3393-3981be74552c73a0f9895c4f372763054d261414073.011590
43982be74552c73a0f9895c4f372763054d261414073.011590
\n", "
" ], "text/plain": [ " code level md5 note_row_id subject_id\n", "0 41071 source be74552c73a0f9895c4f372763054d26 1414073.0 11590\n", "1 39891 source be74552c73a0f9895c4f372763054d26 1414073.0 11590\n", "2 390-459 0 be74552c73a0f9895c4f372763054d26 1414073.0 11590\n", "3 393-398 1 be74552c73a0f9895c4f372763054d26 1414073.0 11590\n", "4 398 2 be74552c73a0f9895c4f372763054d26 1414073.0 11590" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "note_codes_df = pd.DataFrame.from_records(note_codes)\n", "note_codes_df.head(5)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-10-24 16:41:53,258 - root - INFO - ../../data/notes_icd9_codes_2016-10-24-16-35.csv\n" ] } ], "source": [ "output_path = pl.Path(path_config['repo_data_dir']).joinpath('notes_icd9_codes_{}.csv'.format(time_str))\n", "logger.info(output_path)\n", "note_codes_df.to_csv(output_path.as_posix(), index=False)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cgidchartdatecharttimehadm_idmd5note_row_idstoretimesubject_id
017770.02154-06-032154-06-03 17:30:00172993.0be74552c73a0f9895c4f372763054d261414073.02154-06-03 17:51:0011590
117698.02183-07-282183-07-28 05:41:00116105.02bd0c96855c6107be79d0150e1f121e71449706.02183-07-28 05:53:0014342
2NaN2170-02-13NaT122710.0bd4bf8040238e3e2cdd7466692defe7347105.0NaT8217
318469.02175-06-072175-06-07 05:39:00196691.06d20d9b6d3cfdc3fc9e8a72fbab0f6971573953.02175-06-07 06:27:0023829
417079.02125-04-272125-04-27 20:51:00133059.0d35003faa86241e60396014264b14a4d1264491.02125-04-27 21:03:00305
\n", "
" ], "text/plain": [ " cgid chartdate charttime hadm_id \\\n", "0 17770.0 2154-06-03 2154-06-03 17:30:00 172993.0 \n", "1 17698.0 2183-07-28 2183-07-28 05:41:00 116105.0 \n", "2 NaN 2170-02-13 NaT 122710.0 \n", "3 18469.0 2175-06-07 2175-06-07 05:39:00 196691.0 \n", "4 17079.0 2125-04-27 2125-04-27 20:51:00 133059.0 \n", "\n", " md5 note_row_id storetime \\\n", "0 be74552c73a0f9895c4f372763054d26 1414073.0 2154-06-03 17:51:00 \n", "1 2bd0c96855c6107be79d0150e1f121e7 1449706.0 2183-07-28 05:53:00 \n", "2 bd4bf8040238e3e2cdd7466692defe73 47105.0 NaT \n", "3 6d20d9b6d3cfdc3fc9e8a72fbab0f697 1573953.0 2175-06-07 06:27:00 \n", "4 d35003faa86241e60396014264b14a4d 1264491.0 2125-04-27 21:03:00 \n", "\n", " subject_id \n", "0 11590 \n", "1 14342 \n", "2 8217 \n", "3 23829 \n", "4 305 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "note_meta_df = pd.DataFrame.from_records(note_meta)\n", "note_meta_df.head(5)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2016-10-24 16:41:54,930 - root - INFO - ../../data/mimic3_note_metadata_2016-10-24-16-35.csv\n" ] } ], "source": [ "output_path = pl.Path(path_config['repo_data_dir']).joinpath('mimic3_note_metadata_{}.csv'.format(time_str))\n", "logger.info(output_path)\n", "note_meta_df.to_csv(output_path.as_posix(), index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }