
Commit 7ea7e22

Author: goruck
Commit message: latest baseline
1 parent d99dffb commit 7ea7e22

File tree: 2 files changed (+93, -52 lines)

+27, -11 lines
@@ -1,17 +1,32 @@
-from nilm.Arguments import *
+#from nilm.Arguments import *
 import numpy as np
 import os
 import matplotlib.pyplot as plt
 import pandas as pd
+import argparse
 
-appliance_name = 'fridge'
+appliance_name = 'kettle'
 
 #dataset = 'training'
 dataset = 'test'
 #dataset = 'validation'
 #dataset = 'train'
 
-for filename in os.listdir(args.datadir + appliance_name):
+DATA_DIRECTORY = '/home/lindo/Develop/nilm/ml/dataset_management/refit/'
+
+def get_arguments():
+    parser = argparse.ArgumentParser(description='sequence to point learning \
+        example for NILM')
+    parser.add_argument('--data_dir', type=str, default=DATA_DIRECTORY,
+                        help='The directory containing the CLEAN REFIT data')
+    parser.add_argument('--appliance_name', type=str, default='kettle',
+                        help='which appliance you want to train: kettle,\
+                        microwave,fridge,dishwasher,washingmachine')
+    return parser.parse_args()
+
+args = get_arguments()
+
+for filename in os.listdir(args.data_dir + appliance_name):
     if dataset == 'train' and dataset.upper() in filename.upper() and 'TEST' in filename.upper():
         test_filename = filename
     elif dataset == 'training' and dataset.upper() in filename.upper():
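Note: with the new argparse interface in place, the script would be invoked along these lines (this page does not show the script's own file name, so view_dataset.py below is a hypothetical placeholder):

    python view_dataset.py --data_dir /home/lindo/Develop/nilm/ml/dataset_management/refit/ --appliance_name kettle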
@@ -23,9 +38,9 @@
 
 chunksize = 10 ** 6
 
-for idx, chunk in enumerate(pd.read_csv(args.datadir + appliance_name + '/' + test_filename,
+for idx, chunk in enumerate(pd.read_csv(args.data_dir + appliance_name + '/' + 'kettle_training_.csv',
                                         # index_col=False,
-                                        names=['aggregate', appliance_name],
+                                        names=['aggregate', appliance_name, 'status'],
                                         # usecols=[1, 2],
                                         # iterator=True,
                                         #skiprows=15 * 10 ** 6,
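Because chunksize is set, pd.read_csv returns an iterator of DataFrames rather than a single frame, which is what makes the enumerate loop in the next hunk work. A minimal standalone illustration ('data.csv' is a placeholder file name):

    import pandas as pd

    # Each chunk is a DataFrame of up to 10**6 rows; enumerate numbers them.
    for idx, chunk in enumerate(pd.read_csv('data.csv', chunksize=10 ** 6)):
        print(idx, len(chunk))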
@@ -34,25 +49,26 @@
                                         )):
 
     # de-normalization
-    chunk['aggregate'] = chunk['aggregate'] * 822 + 522
-    chunk[appliance] = chunk[appliance] * params_appliance[args.appliance_name]['std'] \
-        + params_appliance[args.appliance_name]['mean']
+    #chunk['aggregate'] = chunk['aggregate'] * 822 + 522
+    #chunk[appliance] = chunk[appliance] * params_appliance[args.appliance_name]['std'] \
+    #+ params_appliance[args.appliance_name]['mean']
 
 
     fig = plt.figure(num='Figure {:}'.format(idx))
     ax1 = fig.add_subplot(111)
 
     ax1.plot(chunk['aggregate'])
     ax1.plot(chunk[appliance_name])
+    ax1.plot(chunk['status'])
 
     ax1.grid()
     ax1.set_title('{:}'.format(test_filename), fontsize=14, fontweight='bold')
     ax1.set_ylabel('Power normalized')
     ax1.set_xlabel('samples')
-    ax1.legend(['aggregate', appliance_name])
+    ax1.legend(['aggregate', appliance_name, 'status'])
 
     mng = plt.get_current_fig_manager()
     mng.resize(*mng.window.maxsize())
-    plt.show(fig)
+    plt.show()
 
-    del chunk
+    del chunk
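The de-normalization block is disabled because the CSVs being plotted are already in scaled units (see normalize_dataset.py below). If raw watts were wanted on the plot again, inverting that scaling would look roughly like this; agg_mean, agg_std, and max_on_power are assumed to be the same values normalize_dataset.py used, and the appliance inverse assumes its min-max branch (min = 0, max = max_on_power):

    # Sketch only, not part of this commit: undo z-score standardization of the
    # aggregate channel and min-max normalization of the appliance channel.
    chunk['aggregate'] = chunk['aggregate'] * agg_std + agg_mean
    chunk[appliance_name] = chunk[appliance_name] * max_on_power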

ml/dataset_management/refit/normalize_dataset.py (+66, -41 lines)
@@ -1,4 +1,4 @@
-"""Scale datasets created by create_new_dataset.py.
+"""Scale datasets created by create_new_dataset.py and add on-off status.
 
 Copyright (c) 2023 Lindo St. Angel
 """
@@ -59,68 +59,93 @@ def get_zscore(value, values):
 
 args = parser.parse_args()
 
-print(f'Target appliance: {args.appliance}')
+appliance = args.appliance
 
-path = os.path.join(args.datadir, args.appliance)
+print(f'Target appliance: {appliance}')
+
+path = os.path.join(args.datadir, appliance)
 
 # Get statistics from training dataset.
-train_file_name = os.path.join(path, f'{args.appliance}_training_.csv')
+train_file_name = os.path.join(path, f'{appliance}_training_.csv')
 try:
     df = load(train_file_name)
+    aggregate_power = df.loc[:, 'aggregate']
+    appliance_power = df.loc[:, appliance]
 
-    # Remove outliers.
-    #df = df[df < 10 * df.iloc[:,0].std()]
-
-    train_agg_mean = df.iloc[:,0].mean()
-    train_agg_std = df.iloc[:,0].std()
+    train_agg_mean = aggregate_power.mean()
+    train_agg_std = aggregate_power.std()
     print(f'Training aggregate mean = {train_agg_mean}, std = {train_agg_std}')
 
-    train_app_mean = df.iloc[:,1].mean()
-    train_app_std = df.iloc[:,1].std()
+    train_app_mean = appliance_power.mean()
+    train_app_std = appliance_power.std()
     print(f'Training appliance mean = {train_app_mean}, std = {train_app_std}')
 
-    train_app_min = df.iloc[:,1].min()
-    train_app_max = df.iloc[:,1].max()
+    train_app_min = appliance_power.min()
+    train_app_max = appliance_power.max()
     print(f'Training appliance min = {train_app_min}, max = {train_app_max}')
 
     del df
 except Exception as e:
     sys.exit(e)
 
-# Standardize (or normalize) each dataset.
+max_on_power = common.params_appliance[appliance]['max_on_power']
+
+# Standardize (or normalize) each dataset and add status.
 for _, file_name in enumerate(os.listdir(path)):
     file_path = os.path.join(path, file_name)
 
     df = load(file_path)
 
-    print(f'\nStatistics for {file_name}:')
-    print(df.iloc[:,0].describe())
-    print(df.iloc[:,1].describe())
-
-    if common.USE_ALT_STANDARDIZATION:
-        print('Using alt standardization')
+    print(f'\n*** Working on {file_name} ***')
+    print('Raw dataset statistics:')
+    print(df.loc[:, 'aggregate'].describe())
+    print(df.loc[:, appliance].describe())
+
+    # Limit appliance power to [0, max_on_power].
+    print(f'Limiting appliance power to [0, {max_on_power}]')
+    df.loc[:, appliance] = df.loc[:, appliance].clip(0, max_on_power)
+
+    # Get appliance status and add to end of dataframe.
+    print('Computing on-off status.')
+    status = common.compute_status(df.loc[:, appliance].to_numpy(), appliance)
+    df.insert(2, 'status', status)
+    num_on = len(df[df["status"]==1])
+    num_off = len(df[df["status"]==0])
+    print(f'Number of samples with on status: {num_on}')
+    print(f'Number of samples with off status: {num_off}')
+    assert num_on + num_off == df.iloc[:, 2].size
 
     # Standardize aggregate dataset.
     agg_mean = common.ALT_AGGREGATE_MEAN if common.USE_ALT_STANDARDIZATION else train_agg_mean
     agg_std = common.ALT_AGGREGATE_STD if common.USE_ALT_STANDARDIZATION else train_agg_std
-    print(f'\nStandardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
-    df.iloc[:,0] = (df.iloc[:,0] - agg_mean) / agg_std
-
-    # Standardize appliance dataset.
-    alt_app_mean = common.params_appliance[args.appliance]['alt_app_mean']
-    alt_app_std = common.params_appliance[args.appliance]['alt_app_std']
-    app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
-    app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
-    print(f'\nStandardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
-    df.iloc[:,1] = (df.iloc[:,1] - app_mean) / app_std
+    print(f'Standardizing aggregate dataset with mean = {agg_mean} and std = {agg_std}.')
+    df.loc[:, 'aggregate'] = (df.loc[:, 'aggregate'] - agg_mean) / agg_std
+
+    # Scale appliance dataset.
+    if common.USE_APPLIANCE_NORMALIZATION:
+        # Normalize appliance dataset to [0, max_on_power].
+        min = 0
+        max = max_on_power
+        print(f'Normalizing appliance dataset with min = {min} and max = {max}.')
+        df.loc[:, appliance] = (df.loc[:, appliance] - min) / (max - min)
+    else:
+        # Standardize appliance dataset.
+        alt_app_mean = common.params_appliance[appliance]['alt_app_mean']
+        alt_app_std = common.params_appliance[appliance]['alt_app_std']
+        app_mean = alt_app_mean if common.USE_ALT_STANDARDIZATION else train_app_mean
+        app_std = alt_app_std if common.USE_ALT_STANDARDIZATION else train_app_std
+        print('Using alt standardization.' if common.USE_ALT_STANDARDIZATION
+              else 'Using default standardization.')
+        print(f'Standardizing appliance dataset with mean = {app_mean} and std = {app_std}.')
+        df.loc[:, appliance] = (df.loc[:, appliance] - app_mean) / app_std
 
     ### Other ways of scaling the datasets are commented out below ###
     ### The current method seems to give the best results ###
 
     # Remove outliers.
     # compute z-scores for all values
     # THIS TAKES FOREVER - DO NOT USE
-    #df['z-score'] = df[args.appliance].apply(lambda x: get_zscore(x, df[args.appliance]))
+    #df['z-score'] = df[appliance].apply(lambda x: get_zscore(x, df[appliance]))
    #outliers = df[df['z-score'] > 6]
    #print(outliers)
    #exit()
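common.compute_status itself is outside this diff; based on how it is called here (appliance power as a numpy array in, one 0/1 label per sample out), a minimal threshold-based sketch could look like the following. The on_power_threshold key is an assumption, and the real implementation may also filter out short on/off transients:

    import numpy as np

    def compute_status(power: np.ndarray, appliance: str) -> np.ndarray:
        # Label a sample 'on' (1) when power exceeds the appliance's
        # on-power threshold, else 'off' (0). Sketch only; 'on_power_threshold'
        # is an assumed key in params_appliance.
        threshold = params_appliance[appliance]['on_power_threshold']
        return (power > threshold).astype(int)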
@@ -160,26 +185,26 @@ def get_zscore(value, values):
     # Normalize appliance dataset to [0, 1].
     #min = df.iloc[:,1].min()
     #max = df.iloc[:,1].max()
-    #print(f'\nNormalizing appliance dataset with min = {min} and max = {max}')
-    #df.iloc[:,1] = (df.iloc[:,1] - min) / (max - min)
+    #print(f'Normalizing appliance dataset with min = {min} and max = {max}')
+    #df.iloc[:, 1] = (df.iloc[:, 1] - min) / (max - min)
 
-    print(f'\nStatistics for {file_name} after scaling:')
-    print(df.iloc[:,0].describe())
-    print(df.iloc[:,1].describe())
+    print(f'Statistics for {file_name} after scaling:')
+    print(df.loc[:, 'aggregate'].describe())
+    print(df.loc[:, appliance].describe())
 
     # Show dataset histograms.
-    df.iloc[:,0].hist()
+    df.loc[:, 'aggregate'].hist()
     plt.title(f'Histogram for {file_name} aggregate')
     plt.show()
-    df.iloc[:,1].hist()
-    plt.title(f'Histogram for {file_name} {args.appliance}')
+    df.loc[:, appliance].hist()
+    plt.title(f'Histogram for {file_name} {appliance}')
     plt.show()
 
     # Check for NaNs.
-    print(f'\nNaNs present: {df.isnull().values.any()}')
+    print(f'NaNs present: {df.isnull().values.any()}')
 
     # Save scaled dataset and overwrite existing csv.
-    print(f'\nSaving dataset to {file_path}.')
+    print(f'*** Saving dataset to {file_path}. ***')
     df.to_csv(file_path, index=False)
 
     del df
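On the "THIS TAKES FOREVER" note above: the commented-out outlier pass is slow because apply calls get_zscore once per row, presumably recomputing the column statistics on every call. If outlier removal is ever revisited, a vectorized equivalent computes the whole z-score column in one pass (a sketch using the same column names as the script):

    # Vectorized z-scores: one mean/std computation, no per-row apply.
    col = df[appliance]
    df['z-score'] = (col - col.mean()) / col.std()
    outliers = df[df['z-score'] > 6]
    print(outliers)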
