This repository was archived by the owner on Aug 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparseGroundTruth.py
85 lines (67 loc) · 3.26 KB
/
parseGroundTruth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import sys

import pandas as pd

# NOTE(review): the original import list had an empty slot between two commas
# (a syntax error), so one imported name appears to have been lost.
# `setWikidataUrisfromDbpedia_en` is called in parseNEEL but is not imported
# anywhere in view -- TODO confirm which module provides it and import it.
from api_pkg.utils.tokenization import fromAnnotationToTokens, addMissingText
# Name of the ground-truth dataset to parse, taken from the first command-line
# argument; the branches further down compare it against dataset names such
# as 'neel2015'.
if len(sys.argv) < 2:
    sys.exit('usage: parseGroundTruth.py <ground_truth>')
ground_truth = sys.argv[1]

# NOTE(review): the literal '+' after 'training_data/' becomes part of the
# directory name (e.g. 'data/training_data/+neel2015/'). It looks unintended,
# but it is kept byte-for-byte because existing data may live under that
# name -- confirm against the on-disk layout before changing it.
base_path = 'data/training_data/+' + ground_truth + '/'
train_path = base_path + 'train/'
test_path = base_path + 'test/'
csv_train_path = train_path + 'csv_ground_truth/'
csv_test_path = test_path + 'csv_ground_truth/'
text_train_path = train_path + 'text_files/'
text_test_path = test_path + 'text_files/'
raw_path = base_path + 'raw/'

# Create every output directory up front. `exist_ok=True` tolerates
# directories left over from a previous run while still surfacing real
# failures (permissions, a file in the way, ...) that a bare `except: pass`
# would have hidden. `raw_path` holds the input files and is not created here.
for item in [
    train_path,
    test_path,
    csv_train_path,
    csv_test_path,
    text_train_path,
    text_test_path,
]:
    os.makedirs(item, exist_ok=True)

print(raw_path)
if ground_truth == 'neel2015':
    # Field list kept from the original source comment:
    #[confidence,continue,relevance,text,type,uri,wd_uri,db_uri,db_type]

    def _set_chars(annotations):
        """Attach ``chars = range(start, end)`` to every annotation dict.

        The dicts are modified in place and the same list is returned.
        """
        for row in annotations:
            # NOTE(review): parseNEEL has already made `end` inclusive, so
            # range(start, end) leaves out the last annotated character
            # index -- confirm this is what addMissingText /
            # fromAnnotationToTokens expect.
            row['chars'] = range(row['start'], row['end'])
        return annotations

    def parseNEEL(df_gt, id_text_dict, csv_path, text_path):
        """Write one text file and one annotation CSV per document id.

        For every distinct id in *df_gt* the matching text is looked up, the
        annotated surface strings are extracted, the rows are passed through
        ``setWikidataUrisfromDbpedia_en``, ``addMissingText`` and
        ``fromAnnotationToTokens``, and the results are written to
        ``<text_path><id>.txt`` and ``<csv_path><id>.csv``.

        Args:
            df_gt: annotations with at least the columns ``id``, ``start``,
                ``end`` and ``db_uri``. ``end`` is used as an exclusive
                slice bound on input. The frame is not modified.
            id_text_dict: maps a document id (as ``str``) to its text.
            csv_path: path prefix, including the trailing separator, for the
                CSV output.
            text_path: path prefix, including the trailing separator, for the
                text output.

        Raises:
            KeyError: if an id in *df_gt* has no entry in *id_text_dict*.
        """
        # Work on a copy so the caller's frame keeps its original `id` dtype
        # and `end` values; mutating it would shift `end` again on a second
        # call with the same frame.
        df_gt = df_gt.copy()
        df_gt['id'] = df_gt['id'].astype(str)
        # Turn `end` into the index of the last annotated character.
        df_gt['end'] = df_gt['end'] - 1

        for id_ in set(df_gt['id']):
            print(id_)
            # Copy the filtered rows: a column is added below, and assigning
            # to a slice of `df_gt` is ambiguous chained assignment in pandas.
            df_id = df_gt[df_gt['id'] == id_].copy()
            text = id_text_dict[id_]

            # `end` is inclusive here, hence the +1 for the slice bound.
            df_id['text'] = [
                text[start:end + 1]
                for start, end in zip(df_id['start'], df_id['end'])
            ]
            df_id = setWikidataUrisfromDbpedia_en(df_id, name_col='db_uri')
            df_id = df_id.sort_values(by=['start'])

            annotations = _set_chars(df_id.to_dict(orient='records'))
            annotations = addMissingText(annotations, text)
            annotations = fromAnnotationToTokens(annotations)

            # Explicit UTF-8 so the text file does not depend on the platform
            # default and matches the encoding pandas uses for the CSV.
            with open(text_path + id_ + '.txt', 'w', encoding='utf-8') as outfile:
                outfile.write(text)
            pd.DataFrame(annotations).to_csv(csv_path + id_ + '.csv', index=False)

    def read_tsv_as_dict(path):
        """Read a tab-separated file into ``{first_column: second_column}``.

        Blank lines are skipped and columns after the second are ignored.

        Raises:
            IndexError: if a non-blank line has no tab separator.
        """
        with open(path, encoding='utf-8') as txt_file:
            lines = txt_file.read().splitlines()
        mapping = {}
        for line in lines:
            if not line:
                continue
            fields = line.split('\t')
            mapping[fields[0]] = fields[1]
        return mapping

    train_path_raw = raw_path + 'NEEL2015-training-gold.tsv'
    test_path_raw = raw_path + 'NEEL2015-test-gold.tsv'
    train_path_raw_tweets = raw_path + 'NEEL2015-training-tweets.tsv'
    test_path_raw_tweets = raw_path + 'NEEL2015-test-tweets.tsv'
    # NOTE(review): the original bound `training_path` to the *test* tweets
    # path through a chained assignment. The name is kept in case code outside
    # this branch reads it, but it looks like a leftover -- confirm and remove.
    training_path = test_path_raw_tweets

    # The gold files have no header row; these are the column names assigned
    # to them on load.
    gold_columns = ['id', 'start', 'end', 'db_uri', 'type']
    df_train_raw = pd.read_csv(train_path_raw, sep='\t', header=None, names=gold_columns)
    df_test_raw = pd.read_csv(test_path_raw, sep='\t', header=None, names=gold_columns)

    dict_train_raw_tweets = read_tsv_as_dict(train_path_raw_tweets)
    dict_test_raw_tweets = read_tsv_as_dict(test_path_raw_tweets)

    parseNEEL(df_train_raw, dict_train_raw_tweets, csv_train_path, text_train_path)
    parseNEEL(df_test_raw, dict_test_raw_tweets, csv_test_path, text_test_path)
if ground_truth == 'oke2015':