# synthesize_gp.py
import os
import pickle
import random
import time
import traceback

import numpy as np
import pandas as pd
from pyshgp.gp.estimators import PushEstimator
from pyshgp.gp.genome import GeneSpawner

import datautils
import GP.gp_utils as gp_utils
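
# Assumption (the helper lives outside this file): datautils.get_problem_metadata
# is expected to return the keyword arguments accepted by pyshgp's GeneSpawner
# (n_inputs, instruction_set, literals, erc_generators), since its result is
# unpacked directly into GeneSpawner(**metadata) below.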
# Experimental setup: 200 example cases for the training set (including all
# hand-chosen inputs) and 2,000 for the unseen test set. We recommend a budget
# of 60 million program executions, allocated as 200 training cases used to
# evaluate a population of 1,000 individuals for 300 generations in our
# experimental GP runs.
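# (Budget sanity check from the figures above: 200 training cases x 1,000
# individuals x 300 generations = 60,000,000 program executions.)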
def loop_through_tasks(task_id_lists, base_save_folder, data_dir, metadata_file, num_runs, ga_params):
    for taskid in task_id_lists:
        for r in range(num_runs):
            save_folder = f"{base_save_folder}/{taskid}_{r}"
            # Sleep a random amount so parallel workers are unlikely to race
            # to create the same folder.
            time.sleep(random.random() * 5)
            if not os.path.exists(save_folder):
                os.makedirs(save_folder)
            else:
                # Another worker has already claimed this (task, run) pair.
                continue

            print("working on ")
            print(save_folder)

            print("loading data")
            # Split the data into training and testing sets.
            X_train, y_train, X_test, y_test = datautils.generate_training_test_data(data_dir, taskid, r)

            try:
                scores = pd.DataFrame(columns=['taskid', 'run', 'runtime',
                                               'train_total_error', 'test_total_error',
                                               'train_unsolved', 'test_unsolved'])
                print("Starting the Evolution: ")
                metadata = datautils.get_problem_metadata(metadata_file, taskid)
                spawner = GeneSpawner(**metadata)
                est = PushEstimator(
                    search="GA",
                    population_size=ga_params["population_size"],
                    max_generations=ga_params["max_generations"],
                    spawner=spawner,
                    simplification_steps=10,
                    last_str_from_stdout=True,
                    parallelism=True,
                    verbose=3
                )

                start = time.time()
                est.fit(X=X_train, y=y_train)
                end = time.time()

                this_score = {}
                this_score["runtime"] = end - start

                train_errors = est.solution.error_vector
                assert len(train_errors) == len(y_train)
                this_score["train_total_error"] = np.sum(train_errors)
                # Record the number of unsolved cases in the training set.
                this_score["train_unsolved"] = np.sum(train_errors > 0)

                # Score the test set once and reuse the resulting error vector.
                test_errors = est.score(X_test, y_test)
                assert len(test_errors) == len(y_test)
                this_score["test_total_error"] = np.sum(test_errors)
                # Record the number of unsolved cases in the test set.
                this_score["test_unsolved"] = np.sum(test_errors > 0)

                this_score["taskid"] = taskid
                this_score["run"] = r
                scores.loc[len(scores.index)] = this_score

                with open(f"{save_folder}/scores.pkl", "wb") as f:
                    pickle.dump(scores, f)
                # Each invocation handles exactly one unclaimed (task, run)
                # pair and then exits.
                return
            except Exception as e:
                trace = traceback.format_exc()
                pipeline_failure_dict = {"taskid": taskid, "error": str(e), "trace": trace}
                print("failed on ")
                print(save_folder)
                print(e)
                print(trace)
                with open(f"{save_folder}/failed.pkl", "wb") as f:
                    pickle.dump(pipeline_failure_dict, f)
                return

    # Reached only when every (task, run) folder has already been claimed.
    print("all finished")
if __name__ == "__main__":
    task_id_lists = datautils.PSB2_DATASETS
    base_save_folder = "results"
    data_dir = "../Datasets/PSB2/datasets/"
    metadata_file = "PSB2_metadata.csv"

    # Small configuration for quick local runs.
    num_runs_local = 5
    ga_params_local = {
        "population_size": 10,
        "max_generations": 50
    }

    # Full experimental configuration (1,000 individuals for 300 generations);
    # swap these into the call below to reproduce the HPC runs.
    num_runs_hpc = 100
    ga_params_hpc = {
        "population_size": 1000,
        "max_generations": 300
    }

    loop_through_tasks(task_id_lists, base_save_folder, data_dir, metadata_file, num_runs_local, ga_params_local)
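
# A minimal sketch of how the per-run results written above could be collected
# afterwards. This assumes the results/<taskid>_<run>/scores.pkl layout produced
# by loop_through_tasks; glob is from the standard library, and pd.read_pickle
# can load DataFrames serialized with pickle.dump:
#
#   import glob
#   all_scores = pd.concat(
#       pd.read_pickle(path) for path in glob.glob("results/*/scores.pkl")
#   )
#   print(all_scores.groupby("taskid")[["test_total_error", "test_unsolved"]].mean())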