-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
99 lines (67 loc) · 2.94 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import ast
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
def load_train_df(file_path='train_gold.csv'):
    """Load the training CSV as a frame of noun compounds and paraphrases.

    The CSV must provide the columns ``w1``, ``w2`` and ``paraphrase``. For
    every row, ``w1`` and ``w2`` are concatenated with a single space between
    them to form the ``nc`` value.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A new ``pandas.DataFrame`` with one row per CSV row and two columns:
        ``nc`` (``w1 + " " + w2``) and ``paraphrase`` (copied unchanged).

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    raw_frame = pd.read_csv(file_path)
    # Iterate only the needed columns instead of materializing a Series per
    # row with ``iterrows()``.
    compounds = [
        first + " " + second
        for first, second in zip(raw_frame['w1'], raw_frame['w2'])
    ]
    return pd.DataFrame.from_dict({
        'nc': compounds,
        'paraphrase': raw_frame['paraphrase'].tolist(),
    })
def load_saved_test_df(file_path='test_df.csv'):
    """Load a saved test frame and parse its ``paraphrases`` column.

    Only the ``nc`` and ``paraphrases`` columns are read from the CSV. Each
    ``paraphrases`` cell holds the text of a Python literal (presumably a list
    of paraphrase strings -- confirm against the file that was saved) and is
    parsed with ``ast.literal_eval``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``pandas.DataFrame`` with columns ``nc`` and ``paraphrases``, where
        every ``paraphrases`` cell has been replaced by its parsed value.

    Raises:
        ValueError: If a ``paraphrases`` cell is not a valid Python literal.
    """
    frame = pd.read_csv(file_path, usecols=['nc', 'paraphrases'])
    parsed_cells = [ast.literal_eval(cell) for cell in frame['paraphrases']]
    # Replace the column in one step rather than writing cell-by-cell into the
    # frame while iterating over it. An explicit object dtype keeps each
    # parsed value stored as-is.
    frame['paraphrases'] = pd.Series(parsed_cells, index=frame.index, dtype=object)
    return frame
def load_difficult_set(file_path='difficult_set.csv'):
    """Load the difficult-set CSV as a frame of compounds and paraphrase lists.

    The CSV must provide the columns ``nc`` and ``para``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A new ``pandas.DataFrame`` with one row per CSV row and two columns:
        ``nc`` (copied unchanged) and ``paraphrases`` (a single-element list
        holding that row's ``para`` value).

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    raw_frame = pd.read_csv(file_path)
    return pd.DataFrame.from_dict({
        'nc': raw_frame['nc'].tolist(),
        # Each paraphrase is wrapped in its own list; presumably so the column
        # has the same list-per-row shape as the other ``paraphrases`` frames
        # -- confirm with callers.
        'paraphrases': [[paraphrase] for paraphrase in raw_frame['para']],
    })
def split_train_df_to_dataset(df, split=0.2):
    """Split a frame into training and validation ``Dataset`` objects.

    Args:
        df: The ``pandas.DataFrame`` to split.
        split: Forwarded to ``train_test_split`` as ``test_size``, i.e. the
            portion of ``df`` held out for validation. Defaults to 0.2.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple, each built with
        ``Dataset.from_pandas`` from the corresponding part of the split.
    """
    # Forward the caller's ``split`` so the parameter actually controls the
    # hold-out size instead of a hard-coded 0.2.
    train_df, valid_df = train_test_split(df, test_size=split)
    return Dataset.from_pandas(train_df), Dataset.from_pandas(valid_df)
# Old Methods
def load_valid_dataset(file_path='valid_ds.csv'):
    """Load the validation CSV as a ``Dataset`` of compounds and paraphrases.

    The CSV must provide the columns ``nc`` and ``paraphrases``. Both columns
    are copied as read; the ``paraphrases`` cells are not parsed here.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``Dataset`` built with ``Dataset.from_pandas`` from a frame holding
        only the ``nc`` and ``paraphrases`` columns.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    raw_frame = pd.read_csv(file_path)
    # Read the two needed columns directly instead of copying them row by row
    # through ``iterrows()``.
    frame = pd.DataFrame.from_dict({
        'nc': raw_frame['nc'].tolist(),
        'paraphrases': raw_frame['paraphrases'].tolist(),
    })
    return Dataset.from_pandas(frame)
def load_test_valid_dataset(file_path='test_gold.csv'):
    """Load the gold test CSV, group paraphrases per compound, and split it.

    The CSV must provide the columns ``w1``, ``w2`` and ``paraphrase``. Rows
    are grouped by the noun compound ``w1 + " " + w2``; each compound appears
    once in the grouped frame, in order of first appearance, with a list of
    all its paraphrases. The grouped frame is then split with
    ``train_test_split`` using ``test_size=0.2``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(test_dataset, valid_dataset)`` tuple of ``Dataset`` objects. The
        first is built from the larger part of the split and the second from
        the held-out ``test_size=0.2`` part. Both have the columns ``nc`` and
        ``paraphrases``.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    raw_frame = pd.read_csv(file_path)

    # Map each compound to its paraphrases; dict insertion order keeps the
    # compounds in order of first appearance.
    paraphrases_by_compound = {}
    columns = zip(raw_frame['w1'], raw_frame['w2'], raw_frame['paraphrase'])
    for first, second, paraphrase in columns:
        compound = first + " " + second
        paraphrases_by_compound.setdefault(compound, []).append(paraphrase)

    grouped_frame = pd.DataFrame.from_dict({
        'nc': list(paraphrases_by_compound.keys()),
        'paraphrases': list(paraphrases_by_compound.values()),
    })

    test_part, valid_part = train_test_split(grouped_frame, test_size=0.2)
    return Dataset.from_pandas(test_part), Dataset.from_pandas(valid_part)
def load_test_valid_df(file_path):
    """Load a saved test/validation frame and parse its ``paraphrases`` column.

    Only the ``nc`` and ``paraphrases`` columns are read from the CSV. Each
    ``paraphrases`` cell holds the text of a Python literal (presumably a list
    of paraphrase strings -- confirm against the file that was saved) and is
    parsed with ``ast.literal_eval``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``pandas.DataFrame`` with columns ``nc`` and ``paraphrases``, where
        every ``paraphrases`` cell has been replaced by its parsed value.

    Raises:
        ValueError: If a ``paraphrases`` cell is not a valid Python literal.
    """
    frame = pd.read_csv(file_path, usecols=['nc', 'paraphrases'])
    parsed_cells = [ast.literal_eval(cell) for cell in frame['paraphrases']]
    # Assign the whole column at once rather than mutating the frame cell by
    # cell while iterating over it. An explicit object dtype keeps each parsed
    # value stored as-is.
    frame['paraphrases'] = pd.Series(parsed_cells, index=frame.index, dtype=object)
    return frame