convert_to_text_task2_test_set.py
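# Converts the RDoC Task 2 test data (Combined_batch) from per-construct .xlsx files
# into plain-text files: a sentence-split dump plus raw and preprocessed test files in
# a "<<>>"-separated format. Requires the NLTK 'punkt' models for sent_tokenize.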
import os, sys, csv, re
import numpy as np
import pandas as pd
import string
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from sklearn.utils import shuffle
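# Word tokenizer and external stop-word list used during preprocessing.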
tokenizer = RegexpTokenizer(r'\w+')
cachedStopWords = []
with open("english_stopwords.txt", "r") as f:
    cachedStopWords.extend([line.strip() for line in f.readlines()])
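# Input directory (one .xlsx file per RDoC construct) and output directory for the
# preprocessed test set; construct names are taken from the .xlsx file names.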
data_dir_1 = "./RDoC_raw_data/RDoCTask/RDoCTask2TestData/Combined_batch/"
data_output_dir_1 = "./datasets/Task2_test_data_combined_batch_without_acronym/"
RDoC_contruct_fnames = [fname.split(".")[0] for fname in os.listdir(data_dir_1)]
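# FLOAT_REGEXP matches integer, decimal, and scientific-notation tokens.
# CAPS_REMOVE_LIST collects all-caps section headers; it is not referenced below,
# since the isupper() filter in remove_punctuation_and_replace_num drops every all-caps token.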
FLOAT_REGEXP = re.compile(r'^[-+]?([0-9]+|[0-9]*\.[0-9]+)([eE][-+]?[0-9]+)?$')
CAPS_REMOVE_LIST = ["APPROACH", "BACKGROUND", "COMPARISON", "CONCLUSION", "CONCLUSIONS", "FINDINGS", "IMPLICATIONS",
                    "INTRODUCTION", "LIMITATIONS", "MEASURES", "METHODOLOGY", "METHOD", "METHODS", "OBJECTIVE",
                    "OBJECTIVES", "OUTCOME", "PURPOSE", "RESULTS", "RESEARCH", "SIGNIFICANCE", "STATEMENT", "STUDY", "SUMMARY"]
def is_float(token):
    return FLOAT_REGEXP.match(token) is not None
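# Replace every numeric token with the "<num>" placeholder.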
def replace_num(tokens):
    new_tokens = []
    for token in tokens:
        if is_float(token):
            new_tokens.append("<num>")
        else:
            new_tokens.append(token)
    return new_tokens
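# Keep only the alphabetic characters of a token.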
def preprocess_token(token):
    return "".join([char for char in token if char.isalpha()])
def remove_punctuation_and_replace_num(doc):
    doc_tokens = [token.strip(string.punctuation) for token in tokenizer.tokenize(doc)]
    doc_tokens = [token for token in doc_tokens if not token.isupper()]
    doc_tokens = replace_num(doc_tokens)
    doc_tokens = [preprocess_token(token.lower()) for token in doc_tokens]
    doc_tokens = [token for token in doc_tokens if token not in cachedStopWords]
    # if len(doc_tokens) > 1:
    #     return " ".join(doc_tokens)
    # else:
    #     return ''
    if len(doc_tokens) == 1:
        print(doc_tokens)
    return " ".join(doc_tokens)
if not os.path.exists(data_output_dir_1):
    os.makedirs(data_output_dir_1)
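# Per-construct containers for abstracts, titles, labels, and PubMed ids.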
total_docs = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_titles = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_labels = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
total_ids = {RDoC_name:[] for RDoC_name in RDoC_contruct_fnames}
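# Read each construct's .xlsx file (expects 'pmid', 'title', and 'abstract' columns).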
for RDoC_name in RDoC_contruct_fnames:
    xl = pd.read_excel(open(data_dir_1 + RDoC_name + ".xlsx", 'rb'))
    total_ids[RDoC_name].extend(xl['pmid'].tolist())
    total_titles[RDoC_name].extend(xl['title'].tolist())
    total_docs[RDoC_name].extend(xl['abstract'].tolist())
    total_labels[RDoC_name].extend([RDoC_name] * len(total_docs[RDoC_name]))
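# Split each abstract into sentences; sentences are joined with tab characters.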
for RDoC_name in RDoC_contruct_fnames:
    temp_docs = []
    for doc in total_docs[RDoC_name]:
        temp_doc_sents = nltk.sent_tokenize(doc.strip())
        temp_docs.append("\t".join(temp_doc_sents))
    total_docs[RDoC_name] = temp_docs
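# Human-readable dump of the sentence-split abstracts, grouped by construct.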
with open(data_dir_1 + "/test_docs_sents.txt", "w") as f:
    for RDoC_name in RDoC_contruct_fnames:
        f.write("======================================================================\n")
        f.write(RDoC_name + "\n")
        f.write("======================================================================\n")
        for doc, id in zip(total_docs[RDoC_name], total_ids[RDoC_name]):
            f.write(str(id) + "\n\n")
            f.write("\n".join(doc.split("\t")))
            f.write("\n\n")
test_docs = []
test_titles = []
test_labels = []
test_ids = []
for RDoC_name in RDoC_contruct_fnames:
    test_docs.extend(total_docs[RDoC_name])
    test_titles.extend(total_titles[RDoC_name])
    test_labels.extend(total_labels[RDoC_name])
    test_ids.extend(total_ids[RDoC_name])
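# Raw test file (written alongside the raw data):
# pmid <<>> construct label <<>> title <<>> tab-separated sentences <<>> "dummy" placeholder field.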
with open(data_dir_1 + "/test_docs.txt", "w") as f:
    for doc, title, id, label in zip(test_docs, test_titles, test_ids, test_labels):
        doc_String = str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + "dummy"
        f.write(doc_String + "\n")
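# Preprocessed test file in the same "<<>>"-separated format; empty preprocessed sentences are dropped.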
with open(data_output_dir_1 + "test_docs.txt", "w") as f:
    for id, label, title, doc in zip(test_ids, test_labels, test_titles, test_docs):
        doc = "\t".join([remove_punctuation_and_replace_num(sent) for sent in doc.split("\t") if remove_punctuation_and_replace_num(sent)])
        title = remove_punctuation_and_replace_num(title)
        f.write(str(id) + "<<>>" + label + "<<>>" + title + "<<>>" + doc + "<<>>" + "dummy" + "\n")