
Commit 269fd94
upload codes
1 parent 93464ec commit 269fd94

8 files changed: +1453 −1 lines changed

README.md (+50 −1)

@@ -1 +1,50 @@
-# awesome
# DCAN

Dilated Convolutional Attention Network (DCAN) integrates dilated convolutions, residual connections, and label attention for medical code assignment. It adopts dilated convolutions to capture complex medical patterns with a receptive field that increases exponentially with the dilation size.
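
For intuition: a 1-D convolution with kernel size `k` and dilation `d` spans `1 + (k - 1) * d` input positions, so stacking layers with dilations 1, 2, 4, ... grows the receptive field exponentially with depth. Below is a minimal PyTorch sketch of a residual dilated convolution block; it illustrates the idea only and is not the exact module or hyperparameters used in this repo (the label-attention part is omitted):

```python
import torch
import torch.nn as nn

class DilatedResidualBlock(nn.Module):
    """One dilated 1-D convolution with a residual (skip) connection."""
    def __init__(self, channels, kernel_size=3, dilation=1):
        super().__init__()
        # 'same' padding so the sequence length is preserved
        padding = (kernel_size - 1) * dilation // 2
        self.conv = nn.Conv1d(channels, channels, kernel_size,
                              padding=padding, dilation=dilation)
        self.relu = nn.ReLU()

    def forward(self, x):  # x: (batch, channels, seq_len)
        return self.relu(self.conv(x)) + x  # residual connection

# Dilations 1, 2, 4, 8 -> receptive field grows exponentially with depth.
encoder = nn.Sequential(*[DilatedResidualBlock(128, dilation=2 ** i)
                          for i in range(4)])
out = encoder(torch.randn(2, 128, 100))  # -> (2, 128, 100)
```
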
## Data

Download the MIMIC-III dataset from [PhysioNet](https://mimic.physionet.org).

Organize your data using the following structure:
```
data
| D_ICD_DIAGNOSES.csv
| D_ICD_PROCEDURES.csv
| ICD9_descriptions
└───mimic3/
| | NOTEEVENTS.csv
| | DIAGNOSES_ICD.csv
| | PROCEDURES_ICD.csv
| | *_hadm_ids.csv
```

`ICD9_descriptions` is available [in this repo](https://github.com/jamesmullenbach/caml-mimic/blob/master/mimicdata/ICD9_descriptions), and
the `*_hadm_ids.csv` files are available [here](https://github.com/jamesmullenbach/caml-mimic/tree/master/mimicdata/mimic3).
`MIMIC_RAW_DSUMS` is available [here](https://physionet.org/works/ICD9CodingofDischargeSummaries/), while the remaining MIMIC-II files can be generated with the code provided there.
If you run `consctruct_datasest.py` from `ICD9_Coding_of_Discharge_Summaries` under Python 3 to create the data files, remember to convert the dict objects to lists (lines 82 and 83) and to use `dict.items()` instead of `dict.iteritems()`.
Set the directories of the MIMIC data using `MIMIC_3_DIR`.
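
For reference, the Python 2 to Python 3 change looks roughly like this (variable names are illustrative, not the actual ones in the script):

```python
counts = {'401.9': 12, '038.9': 7}

# Python 2:
#   keys = counts.keys()                 # already a plain list
#   for k, v in counts.iteritems(): ...

# Python 3:
keys = list(counts.keys())               # dict views must be converted explicitly
for k, v in counts.items():              # .iteritems() was removed in Python 3
    print(k, v)
```
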
## Run

``python3 main.py``

Configs are available in `options.py`.

Requirements:
- Python 3.7
- PyTorch 1.5.0
## Citation

```
@inproceedings{ji2020dilated,
  title={Dilated Convolutional Attention Network for Medical Code Assignment from Clinical Text},
  author={Ji, Shaoxiong and Cambria, Erik and Marttinen, Pekka},
  booktitle={3rd Clinical Natural Language Processing Workshop at EMNLP},
  year={2020}
}
```

## References

- https://github.com/jamesmullenbach/caml-mimic
- https://github.com/foxlf823/Multi-Filter-Residual-Convolutional-Neural-Network

dataloader.py (+193)

@@ -0,0 +1,193 @@
import csv
import torch
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from elmo import elmo  # project-local ELMo helper (used below for batch_to_ids)

def load_vocab_dict(args, vocab_file):
    """
    Load the vocabulary from vocab_file and build index<->word mappings.
    Words are indexed from 1, leaving index 0 free for padding.
    """
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for line in vocabfile:
            line = line.strip()
            if line != '':
                vocab.add(line)
    ind2w = {i + 1: w for i, w in enumerate(sorted(vocab))}
    w2ind = {w: i for i, w in ind2w.items()}
    return ind2w, w2ind

def load_full_codes(train_path, mimic2_dir, version='mimic3'):
    """
    Load the full set of ICD codes occurring in the dataset.
    """
    codes = set()
    if version == 'mimic2':
        with open(mimic2_dir, 'r') as f:
            r = csv.reader(f)
            next(r)  # skip header
            for row in r:
                codes.update(set(row[-1].split(';')))
    else:
        for split in ['train', 'dev', 'test']:
            with open(train_path.replace('train', split), 'r') as f:
                lr = csv.reader(f)
                next(lr)  # skip header
                for row in lr:
                    for code in row[3].split(';'):
                        codes.add(code)
    codes = set([c for c in codes if c != ''])
    ind2c = defaultdict(str, {i: c for i, c in enumerate(sorted(codes))})
    return ind2c

def load_lookups(args):
    """
    Load lookup dictionaries: index2word, word2index, index2code, code2index
    """
    ind2w, w2ind = load_vocab_dict(args, args.vocab)
    if args.Y == 'full':
        ind2c = load_full_codes(args.data_path, '%s/proc_dsums.csv' % args.MIMIC_2_DIR, version=args.version)
    else:
        codes = set()
        with open("%s/TOP_%s_CODES.csv" % (args.MIMIC_3_DIR, str(args.Y)), 'r') as labelfile:
            lr = csv.reader(labelfile)
            for row in lr:
                codes.add(row[0])
        ind2c = {i: c for i, c in enumerate(sorted(codes))}
    c2ind = {c: i for i, c in ind2c.items()}
    dicts = {'ind2w': ind2w, 'w2ind': w2ind, 'ind2c': ind2c, 'c2ind': c2ind}
    return dicts

def prepare_instance(dicts, filename, args, max_length):
    """
    Build word-level training instances.
    filename: data/mimic[2/3]/[train/dev/test]_[50/full].csv, e.g., data/mimic3/train_50.csv
    """
    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
    instances = []
    num_labels = len(dicts['ind2c'])
    with open(filename, 'r') as infile:
        r = csv.reader(infile)
        next(r)  # skip header
        for row in r:
            text = row[2]
            # multi-hot label vector over all codes
            labels_idx = np.zeros(num_labels)
            labelled = False
            for l in row[3].split(';'):
                if l in c2ind.keys():
                    code = int(c2ind[l])
                    labels_idx[code] = 1
                    labelled = True
            if not labelled:  # skip documents with no known label
                continue
            tokens_ = text.split()
            tokens = []
            tokens_id = []
            for token in tokens_:
                if token == '[CLS]' or token == '[SEP]':
                    continue
                tokens.append(token)
                # out-of-vocabulary tokens map to index len(w2ind) + 1
                token_id = w2ind[token] if token in w2ind else len(w2ind) + 1
                tokens_id.append(token_id)
            if len(tokens) > max_length:
                tokens = tokens[:max_length]
                tokens_id = tokens_id[:max_length]
            dict_instance = {'label': labels_idx, 'tokens': tokens, 'tokens_id': tokens_id}
            instances.append(dict_instance)
    return instances

def prepare_instance_bert(dicts, filename, args, max_length):
    """
    Build wordpiece-level instances for BERT-based models.
    """
    ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
    instances = []
    num_labels = len(dicts['ind2c'])
    wp_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    with open(filename, 'r') as infile:
        r = csv.reader(infile)
        next(r)  # skip header
        for row in r:
            text = row[2]
            labels_idx = np.zeros(num_labels)
            labelled = False
            for l in row[3].split(';'):
                if l in c2ind.keys():
                    code = int(c2ind[l])
                    labels_idx[code] = 1
                    labelled = True
            if not labelled:
                continue
            tokens_ = text.split()
            tokens = []
            for token in tokens_:
                if token == '[CLS]' or token == '[SEP]':
                    continue
                wps = wp_tokenizer.tokenize(token)
                tokens.extend(wps)
            tokens_max_len = max_length - 2  # reserve room for [CLS] and [SEP]
            if len(tokens) > tokens_max_len:
                tokens = tokens[:tokens_max_len]
            tokens.insert(0, '[CLS]')
            tokens.append('[SEP]')
            tokens_id = wp_tokenizer.convert_tokens_to_ids(tokens)
            masks = [1] * len(tokens)
            segments = [0] * len(tokens)
            dict_instance = {'label': labels_idx, 'tokens': tokens, 'tokens_id': tokens_id,
                             'segments': segments, 'masks': masks}
            instances.append(dict_instance)
    return instances

class MyDataset(Dataset):
    """Thin wrapper exposing a list of instances as a torch Dataset."""

    def __init__(self, X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx]

def pad_sequence(x, max_len, dtype=np.int64):
    # Right-pad every sequence in x with zeros up to max_len.
    padded_x = np.zeros((len(x), max_len), dtype=dtype)
    for i, row in enumerate(x):
        padded_x[i][:len(row)] = row
    return padded_x

def my_collate(x):
    """
    Collate a batch for the word-level models: pad token ids and masks to
    the longest sequence in the batch, and build ELMo character ids from
    the raw tokens.
    """
    words = [x_['tokens_id'] for x_ in x]
    seq_len = [len(w) for w in words]
    masks = [[1] * len(w) for w in words]
    max_seq_len = max(seq_len)  # pad to the longest sequence in the batch
    # max_seq_len = args.MAX_LENGTH  # TODO for capsule network

    inputs_idx = torch.LongTensor(pad_sequence(words, max_seq_len))
    inputs_mask = torch.LongTensor(pad_sequence(masks, max_seq_len))
    labels = torch.FloatTensor([x_['label'] for x_ in x])
    inputs_text = [x_['tokens'] for x_ in x]
    inputs_text = elmo.batch_to_ids(inputs_text)
    return inputs_idx, labels, inputs_text, inputs_mask

def my_collate_bert(x):
    """
    Collate a batch for BERT-based models: pad token ids, segment ids, and
    attention masks to the longest sequence in the batch.
    """
    words = [x_['tokens_id'] for x_ in x]
    segments = [x_['segments'] for x_ in x]
    masks = [x_['masks'] for x_ in x]
    seq_len = [len(w) for w in words]
    max_seq_len = max(seq_len)  # max of batch

    inputs_idx = torch.LongTensor(pad_sequence(words, max_seq_len))
    segments = torch.LongTensor(pad_sequence(segments, max_seq_len))
    masks = torch.LongTensor(pad_sequence(masks, max_seq_len))
    labels = torch.FloatTensor([x_['label'] for x_ in x])
    return inputs_idx, segments, masks, labels
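
Taken together, a typical way to wire these pieces up looks like the sketch below. The `args` fields and the `max_length` value follow the conventions above, but they are assumptions for illustration, not necessarily what `options.py` defines.

```python
from torch.utils.data import DataLoader

# dicts maps words and ICD codes to indices (and back)
dicts = load_lookups(args)
train_instances = prepare_instance(dicts, 'data/mimic3/train_50.csv', args,
                                   max_length=2500)
train_loader = DataLoader(MyDataset(train_instances), batch_size=16,
                          shuffle=True, collate_fn=my_collate)
for inputs_idx, labels, inputs_text, inputs_mask in train_loader:
    pass  # forward/backward pass here
```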
