-
Notifications
You must be signed in to change notification settings - Fork 1
/
classify.py
141 lines (116 loc) · 5.06 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import math
import metapy
import sys
import time
from tkinter import Tk, Label, Text, Button, LEFT, RIGHT, END
class SimpleGUI:
    """Tkinter front end that sends free-text symptoms to the classifier.

    The window holds an instruction label, a multi-line text box, a
    "Diagnose!" button and a label on which the response is displayed.
    """

    def __init__(self, master, classifier, fwd_idx):
        """Build the widgets and remember the classification objects.

        Args:
            master: Tk root (or other container) the widgets are packed into.
            classifier: Trained classifier later handed to ``diagnose``.
            fwd_idx: Forward index later handed to ``diagnose``.
        """
        self.master = master
        # Keep the collaborators on the instance so the button callback does
        # not depend on module-level globals that happen to share these names.
        self.classifier = classifier
        self.fwd_idx = fwd_idx
        master.title("EHR Diagnosis Tool")
        self.label = Label(master, text="Use this tool to help predict your medical conditions.")
        self.label.pack()
        self.entry = Text(master)
        self.entry.pack()
        self.submit_button = Button(master, text="Diagnose!", command=self.parseInput)
        self.submit_button.pack(side=LEFT)
        self.response = Label(master, text="")
        self.response.pack(side=RIGHT)

    def parseInput(self):
        """Button callback: classify the entered text and show the response.

        Nothing happens when the text box is empty or holds only whitespace.
        """
        # A Tk Text widget always reports a trailing newline, so a plain
        # length check can never detect empty input. "end-1c" excludes that
        # newline and strip() rejects whitespace-only entries.
        user_input = self.entry.get("1.0", "end-1c").strip()
        if not user_input:
            return
        pred = diagnose(self.classifier, self.fwd_idx, user_input)
        self.response["text"] = get_response(pred)
def make_classifier(training, inv_idx, fwd_idx):
    """
    Build the classifier used for diagnosis.

    A OneVsAll ensemble of SGD learners with hinge loss is trained on
    `training`; the original author chose it because it scored the highest
    accuracy on their training set. `inv_idx` and `fwd_idx` are accepted but
    not used by this function.
    """
    base_learner = metapy.classify.SGD
    return metapy.classify.OneVsAll(training, base_learner, loss_id='hinge')
def make_indxs(cfg):
    """
    Create both indexes from the configuration `cfg`.

    Returns a 2-tuple: (inverted index, forward index).
    """
    inverted = metapy.index.make_inverted_index(cfg)
    forward = metapy.index.make_forward_index(cfg)
    return inverted, forward
def make_datasetview(fwd_idx):
    """
    Wrap the forward index in the dataset objects the classifier consumes.

    Returns a 2-tuple: (MulticlassDataset, MulticlassDatasetView over it).
    """
    dataset = metapy.classify.MulticlassDataset(fwd_idx)
    return dataset, metapy.classify.MulticlassDatasetView(dataset)
def test_classifier(dset, view, fwd_idx, inv_idx):
    """
    Print 5-fold cross-validation statistics for the classifier.

    A fresh classifier is built for every fold with `make_classifier`, and
    the resulting confusion matrix statistics are printed.

    `dset` is the dataset that gets cross-validated. `fwd_idx` and `inv_idx`
    are forwarded to `make_classifier`. `view` is not used; it stays in the
    signature so existing callers keep working.
    """
    # The former 75/25 train/test slices of `view` were never read, so they
    # have been dropped; cross-validation does its own splitting of `dset`.
    print('Running cross-validation...')
    matrix = metapy.classify.cross_validate(
        lambda fold: make_classifier(fold, inv_idx, fwd_idx), dset, 5)
    matrix.print_stats()
def diagnose(classifier, fwd_idx, user_input):
    """
    Classify a piece of free text.

    `user_input` is placed in a new metapy Document, tokenized with
    `fwd_idx`, and the result is passed to `classifier.classify`, whose
    return value is returned unchanged.
    """
    query = metapy.index.Document()
    query.content(user_input)
    features = fwd_idx.tokenize(query)
    return classifier.classify(features)
# Message shown for each class label the classifier can predict.
_RESPONSES = {
    "acidreflux": "You may have acid reflux.\nAntacids may help relieve some pain.",
    "als": "You may have ALS.\nSome drugs can slow the spread of ALS.",
    "alzheimer": "You may have Alzheimer's.\nThere are currently no drugs to treat Alzheimer's, but home care may help.",
    "breastcancer": "You may have breast cancer.\nConsult your doctor about chemotherapy.",
    "diabetes": "You may have diabetes.\nInsulin injections may help reduce the effect of diabetes.",
    "heartdisease": "You may have heart disease.\nConsult your doctor about medicine and making lifestyle changes.",
    "hemophilia": "You may have hemophilia.\nConsult your doctor about medication that can replace the clotting factor in your bloodstream.",
    "multiplesclerosis": "You may have multiple sclerosis.\nIt is important to get a formal diagnosis before starting any therapies.",
    "parkinsons": "You may have Parkinson's.\nPhysical therapy can help restore some independence to your life.",
}

# Message shown when the prediction is not one of the known labels.
_UNKNOWN_RESPONSE = "I'm sorry, something went wrong."


def get_response(prediction):
    """Return the user-facing message for a predicted class label.

    Args:
        prediction: Label produced by the classifier, e.g. ``"diabetes"``.

    Returns:
        The two-line message for that label, or a generic apology when the
        label is not recognised.
    """
    try:
        return _RESPONSES.get(prediction, _UNKNOWN_RESPONSE)
    except TypeError:
        # Unhashable values (e.g. a list) cannot be dictionary keys; treat
        # them like any other unknown prediction instead of raising.
        return _UNKNOWN_RESPONSE
"""
I am using 9 diseases for diagnosis here:
Heart disease
Alzheimer's
Diabetes
Hemophilia
Breast cancer
Acid reflux
Parkinson's
Multiple Sclerosis
ALS
In order to construct the data, I pulled symptomatic profiles from a variety of online resources.
From there, I used some separate python scripts (found in ./diseases under splitup.py and gencorpus.py) to
arrange the data in a desired file format to be read by the file.toml. The original text samples can be
found under the name of their disease in ./diseases.
"""
if __name__ == '__main__':
    start_time = time.time()
    # Exactly one argument is accepted: the path to the config file. The
    # usage text previously advertised a second argument that the check
    # below would have rejected.
    if len(sys.argv) != 2:
        print("Usage: {} config.toml".format(sys.argv[0]))
        sys.exit(1)
    metapy.log_to_stderr()
    cfg = sys.argv[1]
    print('Building or loading indexes...')
    # Build the data indices using the config.toml.
    inv_idx, fwd_idx = make_indxs(cfg)
    # Set up the MulticlassDatasetView to be used for the classifier.
    dset, view = make_datasetview(fwd_idx)
    view.shuffle()
    # Test the proposed classifier model to see if it upholds a standard of accuracy.
    test_classifier(dset, view, fwd_idx, inv_idx)
    # Construct the classifier from the full (shuffled) view.
    classifier = make_classifier(view, inv_idx, fwd_idx)
    # Now we construct the GUI in order to take user input and run it through the classifier.
    print("Initialize the GUI")
    root = Tk()
    GUI = SimpleGUI(root, classifier, fwd_idx)
    # mainloop() blocks until the window is closed, so the elapsed time
    # printed below covers the whole session, not just start-up.
    root.mainloop()
    print("Elapsed: {} seconds".format(round(time.time() - start_time, 4)))