training_helpers.py
import pandas as pd
import numpy as np
import os

LATEST_FV = 7
LATEST_TD = 2
# Identifier of the pickled dataset under training_datasets/ (see save_dataset,
# which names the pickle files by row count).
DATASET = 12326
# Feature columns fed to the models and the target columns to predict.
TO_USE = ["route_type", "mean_arrival_delay_seconds_lag_5stops", "stop_count", "temperature_2m", "snowfall", "snow_depth", "wind_gusts_10m", "hour"]
#TO_USE = ["route_type", "stop_count", "temperature_2m", "apparent_temperature", "precipitation", "rain", "snowfall", "snow_depth", "cloud_cover", "wind_speed_10m", "wind_speed_100m", "wind_gusts_10m", "hour"]
TO_PREDICT = ["mean_arrival_delay_seconds", "mean_on_time_percent"]
#TO_PREDICT = ["mean_on_time_percent"]

_split_cycle_2 = [
    "train", "train", "train", "test",
    "train", "train", "train", "train",
    "train", "train", "train", "test",
    "train", "train", "train", "test",
    "train", "train", "train", "train"
]
_split_cycle_3 = [
    "train", "train", "validate", "test",
    "train", "train", "train", "test",
    "train", "train", "train", "validate",
    "train", "train", "train", "test",
    "train", "train", "train", "validate"
]
# Every route_type value the one-hot encoder should emit a column for; one_hot
# pads any that are missing so encoded frames always share the same layout.
_route_types = [100, 101, 102, 103, 105, 106, 401, 700, 714, 900, 1000, 1501]

def strip_dates(df):
    # Drop the time-index columns so only model features remain.
    return df.drop(["date", "arrival_time_bin"], axis=1)

def one_hot(df):
    # One-hot encode route_type, padding columns for route types that are
    # missing from this particular frame so the layout is always identical.
    if "route_type" in df:
        encoded = pd.get_dummies(df["route_type"], prefix="route_type", dtype="int64")
        for rt in _route_types:
            if f"route_type_{rt}" not in encoded.columns:
                encoded[f"route_type_{rt}"] = 0
        df = pd.concat([df, encoded], axis=1).drop(columns=["route_type"])
    return df
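# For illustration: one_hot(pd.DataFrame({"route_type": [700, 900]})) returns a
# frame with all twelve route_type_<code> columns, where route_type_700 is 1 in
# the first row, route_type_900 is 1 in the second, and every other column is 0.
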
def train_test_split_cyclical(df):
    # Assign rows to train/test by position, repeating the 20-entry cycle.
    # Building the Series on df.index keeps the boolean mask aligned even when
    # the frame does not have a default RangeIndex.
    assignments = pd.Series(
        [_split_cycle_2[i % len(_split_cycle_2)] for i in range(len(df))],
        index=df.index,
    )
    train = df.loc[assignments == "train"]
    test = df.loc[assignments == "test"]
    return train, test

def train_test_split_time(df, test_start):
    # Chronological split: everything before test_start is training data.
    train = df.loc[df["arrival_time_bin"] < test_start]
    test = df.loc[df["arrival_time_bin"] >= test_start]
    return train, test

# Splits into train, validate and test sets using the repeating 20-entry
# three-way cycle.
def train_validate_test_split(df):
    assignments = pd.Series(
        [_split_cycle_3[i % len(_split_cycle_3)] for i in range(len(df))],
        index=df.index,
    )
    train = df.loc[assignments == "train"]
    validate = df.loc[assignments == "validate"]
    test = df.loc[assignments == "test"]
    return train, validate, test

def load_dataset(strip=True):
    # Load the pickled feature/label frames for the configured DATASET id,
    # keep only the TO_USE / TO_PREDICT columns and one-hot encode route_type.
    # With strip=False the arrival_time_bin and date columns are kept as well.
    dataset_dir = "training_datasets"
    x_name = os.path.join(dataset_dir, f"features_{DATASET}.pickle")
    y_name = os.path.join(dataset_dir, f"labels_{DATASET}.pickle")
    if strip:
        x_all = strip_dates(pd.read_pickle(x_name))
    else:
        x_all = pd.read_pickle(x_name)
    x_all = one_hot(x_all[TO_USE if strip else TO_USE + ["arrival_time_bin", "date"]])
    y_all = pd.read_pickle(y_name)[TO_PREDICT]
    return x_all, y_all

def load_xy_time(stamp):
    # Chronological split: rows with arrival_time_bin before `stamp` form the
    # training set, the rest form the test set.
    x_all, y_all = load_dataset(strip=False)
    # Temporarily attach arrival_time_bin to the labels so both frames are
    # split on the same boundary.
    y_all["arrival_time_bin"] = x_all["arrival_time_bin"]
    x_train, x_test = train_test_split_time(x_all, stamp)
    y_train, y_test = train_test_split_time(y_all, stamp)
    # Drop the helper column via reassignment to avoid pandas chained-assignment
    # warnings on the sliced frames.
    y_train = y_train.drop("arrival_time_bin", axis=1)
    y_test = y_test.drop("arrival_time_bin", axis=1)
    return strip_dates(x_train), y_train, strip_dates(x_test), y_test
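# Example (the cut-off below is only illustrative and assumes arrival_time_bin
# holds values comparable with a pandas Timestamp):
#   x_train, y_train, x_test, y_test = load_xy_time(pd.Timestamp("2024-03-01"))
# Every bin strictly before the cut-off lands in the training set; the rest is test.
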
def load_xy_cyclical():
    # Position-based split using the repeating 20-entry cycle; features and
    # labels are split with the same pattern so rows stay paired.
    x_all, y_all = load_dataset(strip=True)
    x_train, x_test = train_test_split_cyclical(x_all)
    y_train, y_test = train_test_split_cyclical(y_all)
    return x_train, y_train, x_test, y_test

def save_dataset(x_all, y_all):
    # Pickle the frames under training_datasets/, using the row count as the
    # file suffix (the DATASET constant above selects one of these files).
    dataset_dir = "training_datasets"
    number_of_rows = x_all.shape[0]
    x_name = os.path.join(dataset_dir, f"features_{number_of_rows}.pickle")
    y_name = os.path.join(dataset_dir, f"labels_{number_of_rows}.pickle")
    x_all.to_pickle(x_name)
    y_all.to_pickle(y_name)

def get_model_dir(name):
    # Return a path inside best_models/, creating the directory if needed.
    model_dir = "best_models"
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    return os.path.join(model_dir, name)
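# For example, get_model_dir("delay_model.json") returns the path
# best_models/delay_model.json (the file name here is only illustrative),
# creating best_models/ on first use.
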
if __name__ == "__main__":
    # Quick smoke test of the three-way split on random data.
    columns = ["foo", "bar"]
    to_test = pd.DataFrame(np.random.rand(50, len(columns)), columns=columns)
    print(train_validate_test_split(to_test))
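    # Additional synthetic smoke test for one_hot and train_test_split_cyclical.
    # The frame below is random illustration data, not real dataset rows.
    demo = pd.DataFrame({
        "route_type": np.random.choice(_route_types, size=40),
        "stop_count": np.random.randint(1, 30, size=40),
    })
    encoded = one_hot(demo)
    print(sorted(encoded.columns))
    demo_train, demo_test = train_test_split_cyclical(encoded)
    print(len(demo_train), len(demo_test))  # 34 train / 6 test with the 20-entry cycle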