- """Scale datasets created by create_new_dataset.py.
+ """Scale datasets created by create_new_dataset.py and add on-off status.

  Copyright (c) 2023 Lindo St. Angel
  """
@@ -59,68 +59,93 @@ def get_zscore(value, values):
    args = parser.parse_args()

-    print(f'Target appliance: {args.appliance}')
+    appliance = args.appliance

-    path = os.path.join(args.datadir, args.appliance)
+    print(f'Target appliance: {appliance}')
+
+    path = os.path.join(args.datadir, appliance)

    # Get statistics from training dataset.
-    train_file_name = os.path.join(path, f'{args.appliance}_training_.csv')
+    train_file_name = os.path.join(path, f'{appliance}_training_.csv')
    try:
        df = load(train_file_name)
+        aggregate_power = df.loc[:, 'aggregate']
+        appliance_power = df.loc[:, appliance]

-        # Remove outliers.
-        #df = df[df < 10 * df.iloc[:,0].std()]
-
-        train_agg_mean = df.iloc[:,0].mean()
-        train_agg_std = df.iloc[:,0].std()
+        train_agg_mean = aggregate_power.mean()
+        train_agg_std = aggregate_power.std()
        print(f'Training aggregate mean = {train_agg_mean}, std = {train_agg_std}')

-        train_app_mean = df.iloc[:,1].mean()
-        train_app_std = df.iloc[:,1].std()
+        train_app_mean = appliance_power.mean()
+        train_app_std = appliance_power.std()
        print(f'Training appliance mean = {train_app_mean}, std = {train_app_std}')

-        train_app_min = df.iloc[:,1].min()
-        train_app_max = df.iloc[:,1].max()
+        train_app_min = appliance_power.min()
+        train_app_max = appliance_power.max()
        print(f'Training appliance min = {train_app_min}, max = {train_app_max}')

        del df
    except Exception as e:
        sys.exit(e)

-    # Standardize (or normalize) each dataset.
+    max_on_power = common.params_appliance[appliance]['max_on_power']
+
+    # Standardize (or normalize) each dataset and add status.
    for _, file_name in enumerate(os.listdir(path)):
        file_path = os.path.join(path, file_name)

        df = load(file_path)

-        print(f'\nStatistics for {file_name}:')
-        print(df.iloc[:,0].describe())
-        print(df.iloc[:,1].describe())
-
-        if common.USE_ALT_STANDARDIZATION:
-            print('Using alt standardization')
+        print(f'\n*** Working on {file_name} ***')
+        print('Raw dataset statistics:')
+        print(df.loc[:, 'aggregate'].describe())
+        print(df.loc[:, appliance].describe())
+
+        # Limit appliance power to [0, max_on_power].
+        print(f'Limiting appliance power to [0, {max_on_power}]')
+        df.loc[:, appliance] = df.loc[:, appliance].clip(0, max_on_power)
+
+        # Get appliance status and add to end of dataframe.
+        print('Computing on-off status.')
+        status = common.compute_status(df.loc[:, appliance].to_numpy(), appliance)
+        df.insert(2, 'status', status)
+        num_on = len(df[df['status'] == 1])
+        num_off = len(df[df['status'] == 0])
+        print(f'Number of samples with on status: {num_on}')
+        print(f'Number of samples with off status: {num_off}')
+        assert num_on + num_off == df.iloc[:, 2].size

        # Standardize aggregate dataset.
        agg_mean = common.ALT_AGGREGATE_MEAN if common.USE_ALT_STANDARDIZATION else train_agg_mean
        agg_std = common.ALT_AGGREGATE_STD if common.USE_ALT_STANDARDIZATION else train_agg_std
-        print(f'\nStandardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
-        df.iloc[:,0] = (df.iloc[:,0] - agg_mean) / agg_std
-
-        # Standardize appliance dataset.
-        alt_app_mean = common.params_appliance[args.appliance]['alt_app_mean']
-        alt_app_std = common.params_appliance[args.appliance]['alt_app_std']
-        app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
-        app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
-        print(f'\nStandardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
-        df.iloc[:,1] = (df.iloc[:,1] - app_mean) / app_std
+        print(f'Standardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
+        df.loc[:, 'aggregate'] = (df.loc[:, 'aggregate'] - agg_mean) / agg_std
+
+        # Scale appliance dataset.
+        if common.USE_APPLIANCE_NORMALIZATION:
+            # Normalize appliance dataset from [0, max_on_power] to [0, 1].
+            min_power = 0
+            max_power = max_on_power
+            print(f'Normalizing appliance dataset with min = {min_power} and max = {max_power}.')
+            df.loc[:, appliance] = (df.loc[:, appliance] - min_power) / (max_power - min_power)
+        else:
+            # Standardize appliance dataset.
+            alt_app_mean = common.params_appliance[appliance]['alt_app_mean']
+            alt_app_std = common.params_appliance[appliance]['alt_app_std']
+            app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
+            app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
+            print('Using alt standardization.' if common.USE_ALT_STANDARDIZATION
+                  else 'Using default standardization.')
+            print(f'Standardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
+            df.loc[:, appliance] = (df.loc[:, appliance] - app_mean) / app_std

        ### Other ways of scaling the datasets are commented out below ###
        ### The current method seems to give the best results ###

        # Remove outliers.
        # Compute z-scores for all values.
        # THIS TAKES FOREVER - DO NOT USE
-        #df['z-score'] = df[args.appliance].apply(lambda x: get_zscore(x, df[args.appliance]))
+        #df['z-score'] = df[appliance].apply(lambda x: get_zscore(x, df[appliance]))
        #outliers = df[df['z-score'] > 6]
        #print(outliers)
        #exit()
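
A note on the "THIS TAKES FOREVER - DO NOT USE" block above: calling get_zscore() through apply() recomputes the column statistics once per row, so the pass is quadratic in the number of samples. Assuming get_zscore(value, values) returns (value - values.mean()) / values.std() (its body is outside this diff), a vectorized sketch that computes the statistics once runs in linear time:

    import pandas as pd

    def zscore(column: pd.Series) -> pd.Series:
        # Compute the mean and std once, then broadcast over the whole column.
        return (column - column.mean()) / column.std()

    # Usage mirroring the commented-out block:
    # df['z-score'] = zscore(df[appliance])
    # outliers = df[df['z-score'] > 6]
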
@@ -160,26 +185,26 @@ def get_zscore(value, values):
        # Normalize appliance dataset to [0, 1].
        #min = df.iloc[:,1].min()
        #max = df.iloc[:,1].max()
-        #print(f'\nNormalizing appliance dataset with min = {min} and max = {max}')
-        #df.iloc[:,1] = (df.iloc[:,1] - min) / (max - min)
+        #print(f'Normalizing appliance dataset with min = {min} and max = {max}')
+        #df.iloc[:, 1] = (df.iloc[:, 1] - min) / (max - min)

-        print(f'\nStatistics for {file_name} after scaling:')
-        print(df.iloc[:,0].describe())
-        print(df.iloc[:,1].describe())
+        print(f'Statistics for {file_name} after scaling:')
+        print(df.loc[:, 'aggregate'].describe())
+        print(df.loc[:, appliance].describe())

        # Show dataset histograms.
-        df.iloc[:,0].hist()
+        df.loc[:, 'aggregate'].hist()
        plt.title(f'Histogram for {file_name} aggregate')
        plt.show()
-        df.iloc[:,1].hist()
-        plt.title(f'Histogram for {file_name} {args.appliance}')
+        df.loc[:, appliance].hist()
+        plt.title(f'Histogram for {file_name} {appliance}')
        plt.show()

        # Check for NaNs.
-        print(f'\nNaNs present: {df.isnull().values.any()}')
+        print(f'NaNs present: {df.isnull().values.any()}')

        # Save scaled dataset and overwrite existing csv.
-        print(f'\nSaving dataset to {file_path}.')
+        print(f'*** Saving dataset to {file_path}. ***')
        df.to_csv(file_path, index=False)

        del df
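
For context on the new status column: common.compute_status() is not shown in this diff. In NILM pipelines an on-off status signal is usually derived by thresholding appliance power at a per-appliance on-power level, sometimes with minimum on/off duration filtering (omitted here). The sketch below illustrates only the thresholding idea; the parameter table and its 'on_power_threshold' key are hypothetical stand-ins for whatever common.params_appliance actually stores:

    import numpy as np

    # Hypothetical parameter table; the real per-appliance values live in
    # common.params_appliance in this repository.
    PARAMS = {'kettle': {'on_power_threshold': 2000.0}}

    def compute_status_sketch(power: np.ndarray, appliance: str) -> np.ndarray:
        # Label each sample 1 (on) when the clipped appliance power is at or
        # above the appliance's on-threshold, else 0 (off).
        threshold = PARAMS[appliance]['on_power_threshold']
        return (power >= threshold).astype(int)

Because every sample is labeled either 1 or 0, the assert after df.insert() holds by construction. On the scaling branch: min-max normalization with min = 0 and max = max_on_power maps appliance power into [0, 1], so predictions can be converted back to watts by multiplying by max_on_power, whereas the standardization path requires keeping the mean and std that were applied.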