# competion (forked from NetManAIOps/Bagel)
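# Per-KPI anomaly detection: train a DonutX model on each KPI in train.csv,
# predict anomaly labels for test.csv, and write a submission file with
# columns (KPI ID, timestamp, predict).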
from model import DonutX
import pandas as pd
import numpy as np
from kpi_series import KPISeries
# used only in the optional threshold-tuning sketch after model fitting
from sklearn.metrics import precision_recall_curve
from evaluation_metric import range_lift_with_delay
# ---- THIS IS OUR CODE ----
# read train and test data
df = pd.read_csv('./train.csv', header=0, index_col=None)
df_2 = pd.read_csv('./test.csv', header=0, index_col=None)
# get the list of KPIs
kpi_list = df_2["KPI ID"].unique().tolist()
# print(kpi_list)
# per-KPI result frames, collected here and concatenated at the end
appended_data = []
# iterate through all the KPIs
for curr_kpi in kpi_list:
print("-----------------------")
print("CURRENT KPI", curr_kpi)
# get a df of current kpi for training - we only want timestamp, value and label columns
df_train = df[df["KPI ID"] == curr_kpi]
df_train = df_train[["timestamp", "value", "label"]]
# print(df_train)
# split into train and valid samples: e.g. 80%:20% ratio
# NOTE: sample data split the data in the KPI data structure
df_train, df_valid = np.split(df_train, [int(.8*len(df_train))])
# create a train KPI data structure. NOTE: it fills in missing points
train_kpi = KPISeries(
value = df_train.value,
timestamp = df_train.timestamp,
label = df_train.label,
)
# create a valid KPI data structure NOTE: it fills in missing points
valid_kpi = KPISeries(
value = df_valid.value,
timestamp = df_valid.timestamp,
label = df_valid.label,
)
# normalise the data, get the mean and standard deviation
train_kpi, train_kpi_mean, train_kpi_std = train_kpi.normalize(return_statistic=True)
valid_kpi = valid_kpi.normalize(mean=train_kpi_mean, std=train_kpi_std)
# test_kpi = test_kpi.normalize(mean=train_kpi_mean, std=train_kpi_std)
# create the model. There is also a Donut model (not DonutX)
model = DonutX(cuda=False, max_epoch=50, latent_dims=8, network_size=[100, 100])
# TRAIN THE MODEL using train_kpi and valid_kpi
# NOTE: there is also a label_sampling method (i think for supervised/unsupervised)
print("***start fitting")
# model.fit(train_kpi.label_sampling(0.), valid_kpi)
model.fit(train_kpi, valid_kpi)
print("***fitting complete")
    # get a df of the current KPI for testing - we only want the timestamp and value columns
    df_test = df_2[df_2["KPI ID"] == curr_kpi]
    df_test = df_test[["timestamp", "value"]]
    # print(df_test)
    # create a test KPI data structure
    # NOTE: it fills in missing points AND creates a label column of 0s
    test_kpi = KPISeries(
        value=df_test.value,
        timestamp=df_test.timestamp,
    )
    # normalise the test data with the training statistics
    test_kpi = test_kpi.normalize(mean=train_kpi_mean, std=train_kpi_std)
    # USE THE MODEL TO PREDICT
    print("***start predicting")
    predicted_labels, threshold = model.detect(test_kpi, return_threshold=True)
    print("***predicting complete")
    # print output of prediction
    print("predicted labels, threshold: ", predicted_labels, threshold)
    unique, counts = np.unique(predicted_labels, return_counts=True)
    print("anomaly counts: ", unique, counts)
    # add the predicted labels to the test KPI
    # (np.int was removed in NumPy 1.24; use the builtin int dtype instead)
    test_kpi._label = np.asarray(predicted_labels, dtype=int)
    # format results for outputting
    final_df = pd.DataFrame()
    final_df["timestamp"] = test_kpi.timestamp
    final_df["KPI ID"] = curr_kpi  # assigned after timestamp: a scalar column on an empty frame would have zero rows
    final_df["predict"] = test_kpi.label
    final_df["missing"] = test_kpi.missing
    # remove the rows that KPISeries injected for missing points
    final_df = final_df[final_df.missing != 1]
    # drop the helper column
    final_df = final_df.drop('missing', axis=1)
    # rearrange columns to the submission format
    final_df = final_df.reindex(columns=["KPI ID", "timestamp", "predict"])
    # append to final results
    appended_data.append(final_df)
# concat results
appended_data = pd.concat(appended_data)
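# optional sanity check (a sketch): dropping the rows KPISeries injected for
# gaps should leave exactly one row per original test point, assuming the
# missing flag marks only injected points
assert len(appended_data) == len(df_2), "submission row count differs from test.csv"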
# dump results to file
appended_data.to_csv('./testsubmission.csv', index=False)
# check the total anomaly count
print("total anomaly count: ")
print(np.unique(appended_data.predict, return_counts=True))