convert_to_text_task2_train_dev_set.py
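# Converts the RDoC Task 2 training data (one Excel file per RDoC construct) into
# plain-text train/validation files: abstracts are sentence-tokenized, split 80/20
# per construct, and written both raw and cleaned (lowercased, numbers mapped to a
# placeholder, stop words removed), with fields joined by the "<<>>" delimiter.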
import os, sys, csv, re
import numpy as np
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from sklearn.utils import shuffle
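# Word tokenizer: keeps alphanumeric runs and drops punctuation.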
tokenizer = RegexpTokenizer(r'\w+')
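# Stop words are read from a plain-text file, one word per line.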
cachedStopWords = []
with open("english_stopwords.txt", "r") as f:
    cachedStopWords.extend([line.strip() for line in f.readlines()])
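# Raw Task 2 data: one .xlsx file per RDoC construct (file name = construct name).
# Cleaned output goes to the "without_acronym" directory (all-caps tokens are dropped).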
data_dir_1 = "./RDoC_raw_data/RDoCTask/RDoCTask2TrainData/Combined_Batch/"
data_output_dir_1 = "./datasets/Task2_without_acronym/"
RDoC_contruct_fnames = [fname.split(".")[0] for fname in os.listdir(data_dir_1)]
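# Matches integers, decimals, and scientific notation; such tokens are mapped to "<num>".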
FLOAT_REGEXP = re.compile(r'^[-+]?([0-9]+|[0-9]*\.[0-9]+)([eE][-+]?[0-9]+)?$')
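# Section headers that appear in all caps in structured abstracts; the cleaning step
# below drops any fully upper-case token via isupper(), so this list is kept for reference.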
CAPS_REMOVE_LIST = ["APPROACH", "BACKGROUND", "COMPARISON", "CONCLUSION", "CONCLUSIONS", "FINDINGS", "IMPLICATIONS",
                    "INTRODUCTION", "LIMITATIONS", "MEASURES", "METHODOLOGY", "METHOD", "METHODS", "OBJECTIVE",
                    "OBJECTIVES", "OUTCOME", "PURPOSE", "RESULTS", "RESEARCH", "SIGNIFICANCE", "STATEMENT", "STUDY", "SUMMARY"]
def is_float(s):
    return bool(FLOAT_REGEXP.match(s))
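# Replace every numeric token with the placeholder "<num>".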
def replace_num(tokens):
    new_tokens = []
    for token in tokens:
        if is_float(token):
            new_tokens.append("<num>")
        else:
            new_tokens.append(token)
    return new_tokens
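# Keep only the alphabetic characters of a token.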
def preprocess_token(token):
    return "".join([char for char in token if char.isalpha()])
def remove_punctuation_and_replace_num(doc):
    doc_tokens = [token.strip(string.punctuation) for token in tokenizer.tokenize(doc)]
    doc_tokens = [token for token in doc_tokens if not token.isupper()]
    doc_tokens = replace_num(doc_tokens)
    doc_tokens = [preprocess_token(token.lower()) for token in doc_tokens]
    doc_tokens = [token for token in doc_tokens if token not in cachedStopWords]
    # if len(doc_tokens) > 1:
    #     return " ".join(doc_tokens)
    # else:
    #     return ''
    return " ".join(doc_tokens)
if not os.path.exists(data_output_dir_1):
    os.makedirs(data_output_dir_1)
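# Per-construct containers, keyed by RDoC construct name.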
total_docs = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_titles = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_labels = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_ids = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_relevant_context = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
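# Read each construct's spreadsheet: PubMed id, title, abstract, and the annotated
# relevant-context sentences; the construct name doubles as the label.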
for RDoC_name in RDoC_contruct_fnames:
    xl = pd.read_excel(data_dir_1 + RDoC_name + ".xlsx")
    total_ids[RDoC_name].extend(xl['pmid'].tolist())
    total_titles[RDoC_name].extend(xl['title'].tolist())
    total_docs[RDoC_name].extend(xl['abstract'].tolist())
    total_relevant_context[RDoC_name].extend(xl['Relevant Context'].tolist())
    total_labels[RDoC_name].extend([RDoC_name] * len(total_docs[RDoC_name]))
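# The "Relevant Context" column stores a stringified Python list of sentences; parse it
# and join the sentences with tabs. Abstracts are sentence-tokenized and joined the same way.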
for RDoC_name in RDoC_contruct_fnames:
    temp_relevance_context = []
    for rel_con in total_relevant_context[RDoC_name]:
        temp_rel_con = [sent.strip("'") for sent in rel_con[1:-1].split("', '")]
        temp_relevance_context.append("\t".join(temp_rel_con))
    total_relevant_context[RDoC_name] = temp_relevance_context
    temp_docs = []
    for doc in total_docs[RDoC_name]:
        temp_doc_sents = nltk.sent_tokenize(doc.strip())
        temp_docs.append("\t".join(temp_doc_sents))
    total_docs[RDoC_name] = temp_docs
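# Dump the sentence-split abstracts and the relevant contexts, grouped by construct,
# for manual inspection.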
with open(data_dir_1 + "/total_docs_sents.txt", "w") as f:
    for RDoC_name in RDoC_contruct_fnames:
        f.write("======================================================================\n")
        f.write(RDoC_name + "\n")
        f.write("======================================================================\n")
        for doc, id in zip(total_docs[RDoC_name], total_ids[RDoC_name]):
            f.write(str(id) + "\n\n")
            f.write("\n".join(doc.split("\t")))
            f.write("\n\n")
with open(data_dir_1 + "/total_docs_relevant_contexts.txt", "w") as f:
    for RDoC_name in RDoC_contruct_fnames:
        f.write("======================================================================\n")
        f.write(RDoC_name + "\n")
        f.write("======================================================================\n")
        for rel_con, id in zip(total_relevant_context[RDoC_name], total_ids[RDoC_name]):
            f.write(str(id) + "\n\n")
            f.write("\n".join(rel_con.split("\t")))
            f.write("\n\n")
train_docs = []
train_titles = []
train_labels = []
train_ids = []
train_relevant_context = []
val_docs = []
val_titles = []
val_labels = []
val_ids = []
val_relevant_context = []
for RDoC_name in RDoC_contruct_fnames:
    docs_train, docs_val, labels_train, labels_val, titles_train, titles_val, ids_train, ids_val, rc_train, rc_val \
        = train_test_split(total_docs[RDoC_name], total_labels[RDoC_name], total_titles[RDoC_name],
                           total_ids[RDoC_name], total_relevant_context[RDoC_name], test_size=0.2, random_state=123)
    train_docs.extend(docs_train)
    train_titles.extend(titles_train)
    train_labels.extend(labels_train)
    train_ids.extend(ids_train)
    train_relevant_context.extend(rc_train)
    val_docs.extend(docs_val)
    val_titles.extend(titles_val)
    val_labels.extend(labels_val)
    val_ids.extend(ids_val)
    val_relevant_context.extend(rc_val)
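# Raw (uncleaned) splits, written next to the input data; fields are joined with "<<>>"
# and sentences within a field are tab-separated.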
with open(data_dir_1 + "/train_docs.txt", "w") as f:
    for doc, title, id, label, rel_con in zip(train_docs, train_titles, train_ids, train_labels, train_relevant_context):
        doc_string = str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + rel_con
        f.write(doc_string + "\n")
with open(data_dir_1 + "/val_docs.txt", "w") as f:
    for doc, title, id, label, rel_con in zip(val_docs, val_titles, val_ids, val_labels, val_relevant_context):
        doc_string = str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + rel_con
        f.write(doc_string + "\n")
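# Cleaned splits, written to the output directory; sentences that become empty after
# cleaning are dropped. test_docs.txt duplicates the validation split.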
with open(data_output_dir_1 + "train_docs.txt", "w") as f:
    for id, label, title, doc, rel_con in zip(train_ids, train_labels, train_titles, train_docs, train_relevant_context):
        doc = "\t".join([remove_punctuation_and_replace_num(sent) for sent in doc.split("\t") if remove_punctuation_and_replace_num(sent)])
        rel_con = "\t".join([remove_punctuation_and_replace_num(sent) for sent in rel_con.split("\t") if remove_punctuation_and_replace_num(sent)])
        title = remove_punctuation_and_replace_num(title)
        f.write(str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + rel_con + "\n")
with open(data_output_dir_1 + "val_docs.txt", "w") as f:
    for id, label, title, doc, rel_con in zip(val_ids, val_labels, val_titles, val_docs, val_relevant_context):
        doc = "\t".join([remove_punctuation_and_replace_num(sent) for sent in doc.split("\t") if remove_punctuation_and_replace_num(sent)])
        rel_con = "\t".join([remove_punctuation_and_replace_num(sent) for sent in rel_con.split("\t") if remove_punctuation_and_replace_num(sent)])
        title = remove_punctuation_and_replace_num(title)
        f.write(str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + rel_con + "\n")
with open(data_output_dir_1 + "test_docs.txt", "w") as f:
    for id, label, title, doc, rel_con in zip(val_ids, val_labels, val_titles, val_docs, val_relevant_context):
        doc = "\t".join([remove_punctuation_and_replace_num(sent) for sent in doc.split("\t") if remove_punctuation_and_replace_num(sent)])
        rel_con = "\t".join([remove_punctuation_and_replace_num(sent) for sent in rel_con.split("\t") if remove_punctuation_and_replace_num(sent)])
        title = remove_punctuation_and_replace_num(title)
        f.write(str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + rel_con + "\n")
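# Earlier output format (label<TAB>title abstract per line), kept commented out for reference.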
"""
train_docs = [remove_punctuation_and_replace_num(doc) for doc in train_docs]
val_docs = [remove_punctuation_and_replace_num(doc) for doc in val_docs]
train_titles = [remove_punctuation_and_replace_num(doc) for doc in train_titles]
val_titles = [remove_punctuation_and_replace_num(doc) for doc in val_titles]
#train_docs, train_titles, train_labels = shuffle(train_docs, train_titles, train_labels, random_state=123)
#val_docs, val_titles, val_labels = shuffle(val_docs, val_titles, val_labels, random_state=123)
with open(data_output_dir_1 + "train_docs.txt", "w") as f:
f.write("\n".join([label + "\t" + title + " " + doc for doc, title, label in zip(train_docs, train_titles, train_labels)]))
with open(data_output_dir_1 + "train_ids.txt", "w") as f:
f.write("\n".join([str(id) for id in train_ids]))
with open(data_output_dir_1 + "val_docs.txt", "w") as f:
f.write("\n".join([label + "\t" + title + " " + doc for doc, title, label in zip(val_docs, val_titles, val_labels)]))
with open(data_output_dir_1 + "val_ids.txt", "w") as f:
f.write("\n".join([str(id) for id in val_ids]))
with open(data_output_dir_1 + "test_docs.txt", "w") as f:
f.write("\n".join([label + "\t" + title + " " + doc for doc, title, label in zip(val_docs, val_titles, val_labels)]))
with open(data_output_dir_1 + "test_ids.txt", "w") as f:
f.write("\n".join([str(id) for id in val_ids]))
"""