Merge pull request #22 from chekoduadarsh/master
Feature: Support for Seq2Seq (LSTM) model for next word prediction
AutoViML authored Nov 20, 2023
2 parents e960c92 + 41b5142, commit a5ca6be
Showing 9 changed files with 1,503 additions and 12 deletions.
1,140 changes: 1,140 additions & 0 deletions Deep_Auto_ViML_Timeseries.ipynb

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions deep_autoviml/data_load/extract.py
@@ -51,6 +51,11 @@
tf.random.set_seed(42)
from tensorflow.keras import layers
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler, StandardScaler

############################################################################################
#### probably the most handy function of all!
def left_subtract(l1,l2):
@@ -1194,6 +1199,49 @@ def is_test(x, y):
def is_train(x, y):
    return not is_test(x, y)
##################################################################################

def load_train_timeseries(train_data_or_file, target, project_name, keras_options, model_options,
                          keras_model_type, verbose=0):
    """
    Author: Adarsh C
    contact: [email protected]
    Loads the training data in CSV format and converts it into a TensorFlow
    TimeseriesGenerator. Before the conversion, it splits the data into
    training and validation sets.
    """
    # Source: https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
    # Source_Author: https://github.com/srivatsan88

    df = pd.read_csv(train_data_or_file)  # Currently supports only .csv

    #### Scale all selected feature columns into the [0, 1] range ####
    scaler = MinMaxScaler()
    feature_data = scaler.fit_transform(df[model_options['features']])

    #### The target is read out of the scaled feature matrix: this assumes the target
    #### occupies the same column position in model_options['features'] as in the csv ####
    target_data = feature_data[:, df.columns.get_loc(target)]

    #### shuffle=False keeps the chronological order of the series intact ####
    x_train, x_test, y_train, y_test = train_test_split(feature_data, target_data,
                        test_size=model_options['validation_size'], random_state=123, shuffle=False)
    train_generator = TimeseriesGenerator(x_train, y_train, length=model_options['window_length'],
                        sampling_rate=model_options['sampling_rate'],
                        batch_size=keras_options['batch_size'], stride=model_options['stride'])
    valid_generator = TimeseriesGenerator(x_test, y_test, length=model_options['window_length'],
                        sampling_rate=model_options['sampling_rate'],
                        batch_size=keras_options['batch_size'], stride=model_options['stride'])

    ######################## Setting up Cat Vocab Dict #######################
    cat_vocab_dict = {}
    cat_vocab_dict['modeltype'] = 'Timeseries'
    cat_vocab_dict['target_variables'] = target
    cat_vocab_dict['project_name'] = project_name
    cat_vocab_dict['model_options'] = model_options
    cat_vocab_dict['keras_options'] = keras_options
    cat_vocab_dict['nlp_vars'] = ""
    cat_vocab_dict['bools'] = False
    cat_vocab_dict['bools_converted'] = False
    cat_vocab_dict['num_labels'] = ""
    cat_vocab_dict['num_classes'] = ""

    return train_generator, valid_generator, cat_vocab_dict
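For intuition, here is a minimal, self-contained sketch (synthetic numbers, not from the commit) of what TimeseriesGenerator yields from a scaled feature matrix: each batch pairs sliding windows of features with the target value that immediately follows each window.

import numpy as np
from keras.preprocessing.sequence import TimeseriesGenerator

data = np.arange(20, dtype="float32").reshape(10, 2)   # 10 timesteps x 2 features
target = data[:, 0]                                    # column 0 acts as the target
gen = TimeseriesGenerator(data, target, length=3, sampling_rate=1,
                          stride=1, batch_size=4)
x, y = gen[0]
print(x.shape, y.shape)   # (4, 3, 2) (4,): 4 windows of 3 timesteps; each label is the next value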

##################################################################################
def load_text_data(text_directory, project_name, keras_options, model_options,
verbose=0):
"""
71 changes: 69 additions & 2 deletions deep_autoviml/deep_autoviml.py
@@ -53,6 +53,7 @@

#############################################################################################
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from IPython.core.display import Image, display
import pickle
#############################################################################################
@@ -84,7 +85,7 @@
from .data_load.extract import find_problem_type, transform_train_target
from .data_load.extract import load_train_data, load_train_data_file
from .data_load.extract import load_train_data_frame, load_image_data
from .data_load.extract import load_text_data
from .data_load.extract import load_text_data, load_train_timeseries

# keras preprocessing
from .preprocessing.preprocessing import perform_preprocessing
@@ -108,6 +109,10 @@
from .utilities.utilities import check_if_GPU_exists, plot_history
from .utilities.utilities import save_model_architecture


from .models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2, lstm1, gru1, rnn1


#############################################################################################
### Split raw_train_set into train and valid data sets first
### This is a better way to split a dataset into train and test ####
@@ -222,6 +227,12 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
"image_channels": default is "" (empty string). Needed only for image use case. Number of channels.
'save_model_path': default is project_name/keras_model_type/datetime-hour-min/
If you provide your own model path as a string, it will save it there.
"features": list: list of features from thhe input time series data (to be considered for timeseries prediciton).
"window_length": window length for the time series data (to be considered for timeseries prediciton).
"sampling_rate": sampling rate for te time series data (to be considered for timeseries prediciton).
"stride": stride for the time series (to be considered for timeseries prediciton)).
"validation_size": train and validation split ratio (to be considered for timeseries prediciton).
"prebuilt-model": select the pre build model from "lstm". "gru", "rnn" ( to be considered for timeseries prediciton).
model_use_case: default is "" (empty string). If "pipeline", you will get back pipeline only, not model.
It is a placeholder for future purposes. At the moment, leave it as empty string.
verbose = 1 will give you more charts and outputs. verbose 0 will run silently
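For the time-series path, here is a minimal sketch of such a model_options dictionary. Every column name and number below is an illustrative placeholder; listing the target first mirrors the assumption made by the inverse-scaling step in predict_model.py.

model_options = {
    "features": ["close", "open", "high"],   # placeholder columns; target listed first
    "window_length": 30,      # look back 30 timesteps per window
    "sampling_rate": 1,       # keep every timestep inside the window
    "stride": 1,              # slide the window one timestep at a time
    "validation_size": 0.2,   # hold out the last 20% of rows (no shuffling)
    "prebuilt-model": "lstm"  # one of "lstm", "gru", "rnn"
}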
@@ -266,7 +277,6 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
os.makedirs(save_logs_path, exist_ok = True)

print('Model and logs being saved in %s' %save_model_path)

if keras_model_type.lower() in ['image', 'images', "image_classification"]:
############### Now do special IMAGE processing here ###################################
if 'image_directory' in model_options.keys():
@@ -323,6 +333,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
print('\nSplitting train into 80+20 percent: train and validation data')
valid_ds = full_ds.enumerate().filter(is_valid).map(recover)
train_ds = full_ds.enumerate().filter(is_train).map(recover)


################### P R E P R O C E S S T E X T #########################
try:
deep_model = preprocessing_text(train_ds, keras_model_type, model_options)
@@ -335,7 +347,62 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
project_name, save_model_flag)
print(deep_model.summary())
return deep_model, cat_vocab_dict
elif keras_model_type.lower() in ['predict time series', 'time series', "time_series", "predict_time_series"]:
    """
    Author: Adarsh C
    contact: [email protected]
    """
    ############### Get the feature columns ###################################
    if 'features' in model_options.keys():
        print('Features to be considered: '+str(model_options['features']))
    else:
        print('    Error: You must provide a list of features in model_options["features"]')
        return
    ################ Load time series data #########
    train_generator, valid_generator, cat_vocab_dict = load_train_timeseries(
                train_data_or_file, target, project_name, keras_options_copy,
                model_options_copy, keras_model_type, verbose=verbose)

    ################### Choosing the Pre-Built model #########################
    model = None
    if model_options['prebuilt-model'].lower() == "lstm":
        model = lstm1.make_lstm(model_options_copy)
    elif model_options['prebuilt-model'].lower() == "rnn":
        model = rnn1.make_rnn(model_options_copy)
    elif model_options['prebuilt-model'].lower() == "gru":
        model = gru1.make_gru(model_options_copy)
    else:
        print("Must choose lstm, gru or rnn in model_options['prebuilt-model']")
        return

    print(model.summary())

    ################### Training the Pre-Built model #########################
    #### Mean squared error suits this next-value regression task ####
    model.compile(loss='mse', optimizer=keras_options_copy['optimizer'], metrics=['mae'])

    #### Batching is handled by the generators, so batch_size is not passed to fit() ####
    if keras_options_copy["early_stopping"]:
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=2, mode='min')
        model.fit(train_generator, epochs=keras_options_copy['epochs'],
                  validation_data=valid_generator, shuffle=False,
                  callbacks=[early_stopping])
    else:
        model.fit(train_generator, epochs=keras_options_copy['epochs'],
                  validation_data=valid_generator, shuffle=False)

    cat_vocab_dict['train_generator'] = train_generator
    cat_vocab_dict['valid_generator'] = valid_generator
    return model, cat_vocab_dict
shuffle_flag = False
#### K E R A S O P T I O N S - THESE CAN BE OVERRIDDEN by your input keras_options dictionary ####
keras_options_defaults = {}
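Taken together, the new branch can be driven end to end. A minimal sketch, assuming the package is imported under the alias used in the project README and using placeholder file, column, and option values:

from deep_autoviml import deep_autoviml as deepauto

model_options = {"features": ["load", "temp", "humidity"],  # placeholder columns
                 "window_length": 24, "sampling_rate": 1, "stride": 1,
                 "validation_size": 0.2, "prebuilt-model": "gru"}
keras_options = {"batch_size": 32, "epochs": 20, "optimizer": "adam",
                 "early_stopping": True}

model, cat_vocab_dict = deepauto.fit("train.csv", "load",
                                     keras_model_type="time_series",
                                     project_name="ts_demo",
                                     model_options=model_options,
                                     keras_options=keras_options)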
3 changes: 2 additions & 1 deletion deep_autoviml/modeling/create_model.py
@@ -27,7 +27,7 @@
from collections import defaultdict
############################################################################################
# data pipelines and feature engg here
from deep_autoviml.models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2
from deep_autoviml.models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2, lstm1
from deep_autoviml.preprocessing.preprocessing_tabular import encode_fast_inputs, create_fast_inputs
from deep_autoviml.preprocessing.preprocessing_tabular import encode_all_inputs, create_all_inputs
from deep_autoviml.preprocessing.preprocessing_tabular import encode_num_inputs, encode_auto_inputs
@@ -259,6 +259,7 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output
return model_body, keras_options
########################## This is for non-auto models #####################################
if isinstance(use_my_model, str) :
print("models "+keras_model_type.lower())
if use_my_model == "":
if keras_model_type.lower() in ['basic', 'simple', 'default','sample model']:
########## Now that we have setup the layers correctly, we can build some more hidden layers
72 changes: 65 additions & 7 deletions deep_autoviml/modeling/predict_model.py
@@ -49,6 +49,13 @@
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split


############################################################################################
# data pipelines
from deep_autoviml.data_load.classify_features import classify_features_using_pandas
@@ -290,6 +297,8 @@ def load_model_dict(model_or_model_path, cat_vocab_dict, project_name, keras_mod
else:
    if modeltype == 'Regression':
        model = tf.keras.models.load_model(os.path.join(model_or_model_path))
    elif modeltype == "Timeseries":
        model = tf.keras.models.load_model(os.path.join(model_or_model_path))
    else:
        model = tf.keras.models.load_model(os.path.join(model_or_model_path),
                    custom_objects={'BalancedSparseCategoricalAccuracy': BalancedSparseCategoricalAccuracy})
@@ -309,6 +318,9 @@ def predict(model_or_model_path, project_name, test_dataset,
start_time2 = time.time()
model, cat_vocab_dict = load_model_dict(model_or_model_path, cat_vocab_dict, project_name, keras_model_type)
##### load the test data set here #######



if keras_model_type.lower() in ['nlp', 'text']:
NLP_VARS = cat_vocab_dict['predictors_in_train']
else:
@@ -335,7 +347,16 @@ def combine_nlp_text(features):
keepdims=False, separator=' ')
return y
################################################################
if isinstance(test_dataset, str):
feature_data = None
target_data = None
if keras_model_type.lower() in ['predict time series', 'time series', "time_series", "predict_time_series"]:
    scaler = MinMaxScaler()
    test_generator, feature_data, target_data = load_test_timeseries(
            test_dataset, cat_vocab_dict['target_variables'], project_name,
            cat_vocab_dict['keras_options'], cat_vocab_dict['model_options'],
            keras_model_type, scaler, verbose=verbose)

elif isinstance(test_dataset, str):
test_ds, cat_vocab_dict2, test_small = load_test_data(test_dataset, project_name=project_name,
cat_vocab_dict=cat_vocab_dict, verbose=verbose)
### You have to load only the NLP or text variables into dataset. otherwise, it will fail during predict
@@ -392,7 +413,7 @@ def combine_nlp_text(features):
print(' combined NLP or text vars: %s into a single combined_nlp_text successfully' %NLP_VARS)
else:
print('No NLP vars in data set. No preprocessing done.')
cat_vocab_dict2 = copy.deepcopy(cat_vocab_dict)
cat_vocab_dict2 = copy.deepcopy(cat_vocab_dict)
##################################################################################
if cat_vocab_dict2['bools_converted']:
BOOLS = []
@@ -473,12 +494,26 @@ def convert_boolean_to_string_predict(features_copy):
num_classes = cat_vocab_dict2['num_classes']
####### save the predictions only upto input size ###
######## This is where we start predictions on test data set ##############
try:
    if keras_model_type.lower() in ['predict time series', 'time series', "time_series", "predict_time_series"]:
        predictions = model.predict(test_generator)
        #### Pair each scaled prediction with the remaining scaled feature columns so the
        #### scaler can invert the whole row back into the original units
        df_pred = pd.concat([pd.DataFrame(predictions),
                    pd.DataFrame(feature_data[:,1:][cat_vocab_dict['model_options']["window_length"]:])], axis=1)
        rev_trans = scaler.inverse_transform(df_pred)
        df = pd.read_csv(test_dataset)
        #### Keep only the trailing rows that have predictions (the first window has none) ####
        y_probas = df[cat_vocab_dict['model_options']['features']][predictions.shape[0]*-1:]
        y_probas[str(cat_vocab_dict['target_variables'])+'_pred'] = rev_trans[:,0]
    else:
        y_probas = model.predict(test_ds, steps=num_steps)
except:
    print('ERROR: Predictions from model erroring.')
    print('    Check your model and ensure test data and their dtypes are same as train data and retry again.')
    return
###### Now convert the model predictions into classes #########
try:
y_test_preds_list = convert_predictions_from_model(y_probas, cat_vocab_dict2, DS_LEN)
@@ -492,6 +527,29 @@ def convert_boolean_to_string_predict(features_copy):
print('Time taken in mins for predictions = %0.0f' %((time.time()-start_time2)/60))
return y_test_preds_list
############################################################################################

def load_test_timeseries(test_data_or_file, target, project_name, keras_options, model_options,
                         keras_model_type, scaler, verbose=0):
    """
    Author: Adarsh C
    contact: [email protected]
    Loads the test data in CSV format and converts it into a TensorFlow TimeseriesGenerator.
    """
    # Source: https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
    # Source_Author: https://github.com/srivatsan88

    df = pd.read_csv(test_data_or_file)
    #### Note: the scaler is re-fit on the test features here; the caller's inverse_transform
    #### assumes the target is the first column listed in model_options['features']
    feature_data = scaler.fit_transform(df[model_options['features']])
    target_data = feature_data[:, df.columns.get_loc(target)]
    test_generator = TimeseriesGenerator(feature_data, target_data,
                        length=model_options['window_length'], sampling_rate=1,
                        batch_size=keras_options['batch_size'])
    return test_generator, feature_data, target_data
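For intuition, a self-contained sketch (synthetic numbers, hypothetical column names) of the inverse-scaling trick used in predict() above: scaled predictions are placed in the target's column position, padded with the other scaled columns, and passed through scaler.inverse_transform to recover the original units.

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

raw = pd.DataFrame({"price": [10., 20., 30., 40.], "volume": [1., 2., 3., 4.]})
scaler = MinMaxScaler()
scaled = scaler.fit_transform(raw)           # both columns mapped into [0, 1]

preds = np.array([[0.5], [0.75]])            # scaled model outputs for "price"
rows = np.hstack([preds, scaled[-2:, 1:]])   # pair them with the other scaled columns
print(scaler.inverse_transform(rows)[:, 0])  # -> [25.  32.5], back in price units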

############################################################################################


def convert_predictions_from_model(y_probas, cat_vocab_dict, DS_LEN):
y_test_preds_list = []
target = cat_vocab_dict['target_variables']
57 changes: 57 additions & 0 deletions deep_autoviml/models/gru1.py
@@ -0,0 +1,57 @@
############################################################################################
#Copyright 2021 Google LLC

#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
############################################################################################
import tensorflow as tf
from tensorflow import keras
#### Make sure it is Tensorflow 2.4 or greater!
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras import layers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, GRU, LeakyReLU
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
############################################################################################

def make_gru(model_options):
    '''
    Author: Adarsh C
    Date created: 30/01/2022
    Date last modified: 30/01/2022
    contact: [email protected]
    Inputs:
        model_options: contains important model hyperparameters
    '''
    model = tf.keras.Sequential()
    #### Stacked GRU layers: return_sequences=True passes the full sequence onward ####
    model.add(GRU(128, input_shape=(model_options['window_length'], len(model_options['features'])),
                  return_sequences=True))
    model.add(LeakyReLU(alpha=0.5))
    model.add(GRU(128, return_sequences=True))
    model.add(LeakyReLU(alpha=0.5))
    model.add(Dropout(0.3))
    model.add(GRU(64, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(1))   #### single-unit head predicts the next target value ####
    return model
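A short usage sketch for the builder above; the option values are placeholders, and the compile settings mirror the regression-style training loop in deep_autoviml.py:

from deep_autoviml.models import gru1

model_options = {"window_length": 24, "features": ["load", "temp", "humidity"]}
model = gru1.make_gru(model_options)
model.compile(loss="mse", optimizer="adam", metrics=["mae"])
model.summary()   # expects input batches shaped (batch, 24, 3)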