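"""Quantify data loss in per-device CSV frame logs.

For every file in DATA_DIR, this script counts missed and duplicated frames,
sequence-counter restarts, buffer-full events and device clock resets, and
writes a per-device summary table to results/db_results.csv.
"""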
import os
import math
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory containing the CSV files
DATA_DIR = 'data'
def seconds_to_hours_minutes_seconds(time_in_seconds):
    """
    Convert a time duration from seconds to hours, minutes, and seconds.

    Parameters:
        time_in_seconds (int or float): The total time in seconds.

    Returns:
        tuple: The time split into hours, minutes, and seconds.
    """
    hours, remainder = divmod(time_in_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds
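
# A minimal usage sketch: 3725 seconds is 1 hour, 2 minutes and 5 seconds.
#   >>> seconds_to_hours_minutes_seconds(3725)
#   (1, 2, 5)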
def calculate_data_collection_time(sampling_rate, number_of_samples):
    """
    Calculate the total data collection time.

    Parameters:
        sampling_rate (int or float): The number of samples per second.
        number_of_samples (int): The total number of samples collected.

    Returns:
        float: The total data collection time in seconds.
    """
    # The number of samples divided by the rate (samples per second) gives seconds
    total_time_seconds = number_of_samples / float(sampling_rate)
    return total_time_seconds
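
# A minimal usage sketch: 180000 samples at 50 Hz span exactly one hour.
#   >>> calculate_data_collection_time(50, 180000)
#   3600.0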
def load_csv_files(data_dir):
    """For each CSV file in data_dir, compute loss statistics and return them as a table."""
    results_table: List = []
    # Iterate through all files in the data directory
    for filename in os.listdir(data_dir):
        if filename.endswith('.csv'):
            n_duplicates: int = 0
            n_missed_packets: int = 0
            seq_rest: int = 0
            loss_points: int = 0
            total_loss_time: int = 0
            avg_break_time: List = []
            buffer_full_event = 0
            buffer_full_ts = []
            time_resets_count: int = 0
            time_resets_ts = []
            buffer_lost_frames: int = 0
            seq_loss: int = 0
            total_pack_seq: int = 0
            seq_loss_ts: List = []
            buffer_full_idx: List = []
            seq_loss_idx: List = []
            # Construct the full file path
            file_path = os.path.join(data_dir, filename)
            # Load the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            # Extract device_id and date from the filename
            parts = filename.split('_')
            device_id = parts[1]
            date_str = parts[2].replace('.csv', '')  # parsed for reference; not used further
            sequences = df['seq'].values
            timestamps = df['timestamp'].values
            sampling_rate = 50  # expected acquisition rate (Hz)
            log = True
            max_seq_number = 2**12 - 1  # 12-bit sequence counter wraps at 4095
            total_number_of_packets = len(sequences)
            total_pack_seq = len(sequences)
            timestamps = np.array(timestamps) * 10**-6  # microseconds to seconds
            # Diff the sequence numbers to check whether any packets are missing
            differences: List[int] = np.diff(sequences).tolist()
            sampling_period: List[float] = np.diff(timestamps).tolist()
            idx_neg = np.where(np.array(sampling_period) < 0)[0]  # timestamps that go backwards
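            # A worked sketch of the diff logic (illustrative values): sequences
            # [5, 6, 8, 4095, 0] give differences [1, 2, 4087, -4095]; 1 means no
            # loss, 2 means one missed packet, and -4095 is a clean wrap of the
            # 12-bit counter from 4095 back to 0.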
            if log:
                if len(idx_neg) > 0:
                    plt.figure()
                    plt.title(str(device_id) + " negative sampling period")
                    for index in idx_neg:
                        plt.axvline(x=index, color='r', linestyle='--')  # vertical line at each index
                    plt.plot(timestamps, ".")
                    plt.ylabel("timestamp (s)")
                    plt.show()
            # Walk over all differences and corresponding timestamps and count data loss
            for i, diff in enumerate(differences):
                # Estimate the number of frames that elapsed from the timestamps,
                # then how many full counter loops that span could contain
                if sampling_period[i] > 0:
                    n_frames_passed = math.floor((timestamps[i + 1] - timestamps[i]) * sampling_rate)
                else:
                    n_frames_passed = 0
                n_full_loops = 0 if n_frames_passed == 1 else n_frames_passed // max_seq_number
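                # A worked sketch (illustrative numbers): a 100 s silence at 50 Hz
                # gives n_frames_passed = 5000 and n_full_loops = 5000 // 4095 = 1,
                # i.e. the 12-bit counter may have wrapped once during the gap.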
                if diff != 1 and diff != -max_seq_number:  # neither a unit step nor a clean wrap: an error
                    if diff == 0:  # a duplicate if the timestamp repeats, otherwise possibly a full-round loss
                        if timestamps[i + 1] == timestamps[i]:  # assumed intent: the original np.unique check on a scalar could never flag a duplicate
                            n_duplicates += 1
                        elif n_full_loops > 0:
                            if log:
                                print(f"ERROR: got full loop and added: {n_full_loops * max_seq_number}")
                            # We lost at least one entire round of the counter
                            missed = max_seq_number * n_full_loops + (max_seq_number - sequences[i]) + (sequences[i+1] - 0)
                            n_missed_packets += missed
                            total_number_of_packets += missed
                            loss_points += 1
                            if sampling_period[i] > 0:
                                total_loss_time += timestamps[i+1] - timestamps[i]
                                avg_break_time += [timestamps[i+1] - timestamps[i]]
                                seq_loss_ts += [timestamps[i+1] - timestamps[i]]
                            else:
                                # e.g. seq 1 -> 4 misses frames 2 and 3: two frame periods
                                total_loss_time += missed / sampling_rate
                                avg_break_time += [missed / sampling_rate]
                                seq_loss_ts += [missed / sampling_rate]
                            seq_loss += missed
                            total_pack_seq += missed
                            seq_loss_idx += [i+1]
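                        # A worked sketch of the full-loop formula (illustrative
                        # numbers): with max_seq_number = 4095, sequences[i] = 4000,
                        # sequences[i+1] = 120 and n_full_loops = 1, missed =
                        # 4095*1 + (4095 - 4000) + 120 = 4310 frames.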
                    elif diff > 1:  # We missed some packets but the counter did not restart at 0
                        # Packets were lost: the device dropped its link to the router/odroid
                        # and reconnected; the sequence number restarts in some packet while the
                        # timestamp does not go back to 0. The socket buffer may be overwritten
                        # or blocked. TCP itself does not lose packets, but we cannot see inside
                        # a TCP packet: the packet count here is our own frame-level layer, not
                        # TCP's, and one TCP packet carries several frames. So this is an error
                        # in the amount of data: the communication link itself was lost.
                        if log:
                            print(f"ERROR: Missed some packets: {diff - 1}")
                        missed = diff - 1
                        loss_points += 1
                        if sampling_period[i] > 0:
                            # Re-estimate the miss count from the elapsed time when the timestamps are usable
                            missed = math.floor((timestamps[i + 1] - timestamps[i]) * sampling_rate)
                            total_loss_time += timestamps[i+1] - timestamps[i]
                            avg_break_time += [timestamps[i+1] - timestamps[i]]
                            seq_loss_ts += [timestamps[i+1] - timestamps[i]]
                        else:
                            total_loss_time += missed / sampling_rate
                            avg_break_time += [missed / sampling_rate]
                            seq_loss_ts += [missed / sampling_rate]
                        n_missed_packets += missed
                        total_number_of_packets += missed
                        seq_loss += missed
                        total_pack_seq += missed
                        seq_loss_idx += [i+1]
                    elif diff < 0:
                        # The counter jumped backwards without a clean wrap (diff == -max_seq_number
                        # was excluded above): the device restarted its sequence number mid-stream
                        # after being off for an unknown time, so the break costs
                        # (max_seq_number - sequences[i]) + sequences[i+1] frames, e.g. (15 - 12) + (7 - 0)
                        seq_rest += 1  # break that restarts the sequence number
                        if log:
                            print(f"ERROR: Missed packets in break: {(max_seq_number - sequences[i]) + (sequences[i+1] - 0)}")
                        missed = (max_seq_number - sequences[i]) + (sequences[i+1] - 0)
                        n_missed_packets += missed
                        loss_points += 1
                        total_loss_time += missed / sampling_rate
                        avg_break_time += [missed / sampling_rate]
                        total_number_of_packets += missed
                        seq_loss += missed
                        total_pack_seq += missed
                        seq_loss_ts += [missed / sampling_rate]
                        seq_loss_idx += [i+1]
                else:
                    # diff is 1 (or a clean wrap): check for loss without a change in packet number
                    if sampling_period[i] > 2*(1/sampling_rate):
                        # The device cannot send or receive data fast enough, the buffer fills
                        # and acquisition stops: the data is lost in acquisition, not in transmission
                        if diff == 1:
                            # Buffer full: the device stalled waiting to send data - it cannot ship
                            # frames in time, runs out of space and pauses acquisition. This may be
                            # on the odroid side (busy with other devices, writing to the database),
                            # a slow wifi chip, the router lowering the transmitted data rate, bad
                            # wifi, or (unlikely) the device failing to sustain the sampling rate.
                            # The odroid normally fetches data from the device at regular intervals
                            # before the buffer fills, so no frames are lost in acquisition while
                            # it waits for the odroid to collect them.
                            buffer_full_event += 1  # no data is collected while the buffer is full
                            buffer_full_ts += [timestamps[i+1] - timestamps[i]]
                            missed = math.floor((timestamps[i + 1] - timestamps[i]) * sampling_rate)
                            n_missed_packets += missed
                            loss_points += 1
                            total_loss_time += timestamps[i + 1] - timestamps[i]
                            avg_break_time += [timestamps[i + 1] - timestamps[i]]
                            total_number_of_packets += missed
                            buffer_lost_frames += missed
                            buffer_full_idx += [i+1]
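            # A worked sketch of the buffer-full test above (illustrative numbers):
            # at 50 Hz the nominal period is 0.02 s, so the threshold is
            # 2*(1/50) = 0.04 s; a 0.1 s gap with diff == 1 counts as one
            # buffer-full event and math.floor(0.1 * 50) = 5 frames lost.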
            if log:
                if len(buffer_full_idx) > 0:
                    plt.figure()
                    plt.title(str(device_id) + " buffer full")
                    for index in buffer_full_idx:
                        plt.axvline(x=index, color='r', linestyle='--')  # vertical line at each index
                    plt.plot(timestamps, ".")
                    plt.ylabel("timestamp (s)")
                    plt.show()
            sampling_period = np.array(sampling_period)
            # Negative sampling periods mark device clock resets
            time_resets = np.where(sampling_period < 0)[0]
            time_resets_count = len(time_resets)
            time_resets_ts = [sampling_period[i] for i in time_resets]  # assumed intent: the negative periods themselves; the original indexed one element past the reset, which can also run out of range
            sampling_period = [num for num in sampling_period if num >= 0]  # drop the negative periods from the statistics
            collection_time = calculate_data_collection_time(sampling_rate, len(sequences))
            hours, minutes, seconds = seconds_to_hours_minutes_seconds(collection_time)
            # Expected duration including the missed frames (assumed intent: the
            # original passed a literal 1 here, which looks like a placeholder)
            expected_time = calculate_data_collection_time(sampling_rate, total_number_of_packets)
            exp_hours, exp_minutes, exp_seconds = seconds_to_hours_minutes_seconds(expected_time)
            if len(avg_break_time) == 0:
                avg_break_time = [0]
            if n_missed_packets == 0:
                # With no missed frames the diffs can only be 1 and possibly the clean
                # wrap -max_seq_number, unless the sequence counter was restarted
                assert (np.unique(differences).shape[0] == 1 or np.unique(differences).shape[0] == 2) or seq_rest != 0
            # Perform calculations
            missed_packet_percent = round(n_missed_packets / total_number_of_packets * 100, 2)
            seq_loss_perc = round(seq_loss / total_pack_seq * 100, 2)
            duplicate_packet = n_duplicates
            # Format times as h:m:s strings
            time_string = f"{int(hours)}:{int(minutes):02d}:{int(seconds):02d}"
            exp_time_string = f"{int(exp_hours)}:{int(exp_minutes):02d}:{int(exp_seconds):02d}"
            mean_break_time = round(np.mean(avg_break_time), 5)
            std_break_time = round(np.std(avg_break_time), 5)
            mean_sampling_period = round(np.mean(sampling_period), 5)
            std_sampling_period = round(np.std(sampling_period), 5)
            max_sampling_period = round(np.max(sampling_period), 5)
            min_sampling_period = round(np.min(sampling_period), 5)
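            # A worked sketch of the percentages above (illustrative numbers):
            # 250 missed frames out of 10000 total give
            # round(250 / 10000 * 100, 2) = 2.5 (%).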
            if len(buffer_full_ts) > 0:
                buffer_full_ts_mean = round(np.mean(buffer_full_ts), 5)
                buffer_full_ts_std = round(np.std(buffer_full_ts), 5)
                max_buffer_full_ts = round(np.max(buffer_full_ts), 5)
                min_buffer_full_ts = round(np.min(buffer_full_ts), 5)
            else:
                buffer_full_ts_mean, buffer_full_ts_std, max_buffer_full_ts, min_buffer_full_ts = 0, 0, 0, 0
            if len(time_resets_ts) > 0:
                max_time_resets_ts = round(np.max(time_resets_ts), 5)
                min_time_resets_ts = round(np.min(time_resets_ts), 5)
                mean_time_resets_ts = round(np.mean(time_resets_ts), 5)
                std_time_resets_ts = round(np.std(time_resets_ts), 5)
            else:
                max_time_resets_ts, min_time_resets_ts, mean_time_resets_ts, std_time_resets_ts = 0, 0, 0, 0
            seq_loss_time = round(np.sum(seq_loss_ts), 5)
            if log:
                if len(seq_loss_idx) > 0:
                    plt.figure()
                    plt.title(str(device_id) + " seq_loss_idx")
                    for index in seq_loss_idx:
                        plt.axvline(x=index, color='r', linestyle='--')  # vertical line at each index
                    plt.plot(timestamps, ".")
                    plt.ylabel("timestamp (s)")
                    plt.show()
                    plt.close()
            # Append this device's row to results_table
            results_table.append([
                1, device_id, missed_packet_percent, n_missed_packets, total_number_of_packets, loss_points, total_loss_time, f"{mean_break_time} +- {std_break_time}",
                duplicate_packet, time_string, exp_time_string,
                seq_rest, seq_loss, seq_loss_perc, seq_loss_time, sampling_rate,
                mean_sampling_period, std_sampling_period, max_sampling_period, min_sampling_period,
                buffer_full_event, buffer_lost_frames, np.sum(buffer_full_ts), f"{buffer_full_ts_mean} +- {buffer_full_ts_std}",
                max_buffer_full_ts, min_buffer_full_ts, time_resets_count, max_time_resets_ts, min_time_resets_ts, mean_time_resets_ts, std_time_resets_ts
            ])
            print(f"SESSION: {1}; [Device = {device_id}] -> Missed packets = {n_missed_packets} (#) | Data Loss {missed_packet_percent} (%)")
    return results_table
# Load the CSV files and build the per-device results table
results_table = load_csv_files(DATA_DIR)
df = pd.DataFrame(results_table)
header = ['session_id', 'device', 'data loss (%)', 'lost frames (#)', 'total frames (#)', 'loss events (#)', 'total loss time (s)', 'avg loss time (s)', 'duplicates (#)', 'obt. duration (h:m:s)', \
'exp. duration (h:m:s)', 'Seq Rest (#)', "Seq Loss (#)", "Seq Loss (%)", "Total Seq Loss (s)", 'exp sampling_rate (Hz)',\
'avg sampling period (s)', 'std sampling period (s)', 'max sampling period (s)', 'min sampling period (s)', "buffer full (#)", "buffer lost frames (#)", "total time buffer full (s)",\
"buffer full avg (ts)", "buffer full max (ts)", "buffer full min (ts)", "time resets (#)", "time resets max (ts)", "time resets min (ts)", "time resets avg (ts)", "time resets std (ts)"]
averages = df.mean(numeric_only=True)  # average of each numeric column
std_devs = df.std(numeric_only=True)  # standard deviation of each numeric column
summary_row = ['---' if col not in averages.index else f"{averages[col]:.2f} ± {std_devs[col]:.2f}" for col in df.columns]
df.loc['Average'] = summary_row
os.makedirs('results', exist_ok=True)  # make sure the output directory exists
df.to_csv('results/db_results.csv', index=True, header=header)