-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #22 from chekoduadarsh/master
Feature: Support for Seq2Seq (LSTM) model for next word prediction
- Loading branch information
Showing
9 changed files
with
1,503 additions
and
12 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,11 @@ | |
tf.random.set_seed(42) | ||
from tensorflow.keras import layers | ||
from tensorflow import keras | ||
|
||
from keras.preprocessing.text import Tokenizer | ||
from keras.preprocessing.sequence import TimeseriesGenerator | ||
from sklearn.preprocessing import MinMaxScaler, StandardScaler | ||
|
||
############################################################################################ | ||
#### probably the most handy function of all! | ||
def left_subtract(l1,l2): | ||
|
@@ -1194,6 +1199,49 @@ def is_test(x, y): | |
def is_train(x, y): | ||
return not is_test(x, y) | ||
################################################################################## | ||
|
||
def load_train_timeseries(train_data_or_file, target, project_name, keras_options, model_options,
                          keras_model_type, verbose=0):
    """
    Author: Adarsh C
    contact: [email protected]
    Load training data from a CSV file, scale it, split it into training and
    validation parts, and wrap each part in a Keras TimeseriesGenerator.

    Inputs:
        train_data_or_file: path (or buffer) handed to pd.read_csv. Only CSV is supported.
        target: name of the target column. It must be one of model_options['features'].
        project_name: stored in the returned dictionary under 'project_name'.
        keras_options: dictionary; 'batch_size' is read from it.
        model_options: dictionary; 'features', 'validation_size', 'window_length',
            'sampling_rate' and 'stride' are read from it.
        keras_model_type: not used here; kept so existing callers keep working.
        verbose: not used here.

    Returns:
        (train_generator, valid_generator, cat_vocab_dict)

    Raises:
        ValueError: if target is not listed in model_options['features'].
    """
    # Source: https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
    # Source_Author: https://github.com/srivatsan88

    df = pd.read_csv(train_data_or_file)  # Currently supports only .csv

    features = list(model_options['features'])
    if target not in features:
        raise ValueError(
            "target '%s' must be one of model_options['features']: %s" % (target, features))

    # NOTE(review): the scaler is fit on the whole file before the split below,
    # so validation rows take part in computing the scaling range.
    scaler = MinMaxScaler()
    feature_data = scaler.fit_transform(df[features])

    # feature_data only holds the selected feature columns, so the target must be
    # located by its position in `features`, not by its position in the full frame.
    target_data = feature_data[:, features.index(target)]

    # shuffle=False keeps the rows in their original (time) order.
    x_train, x_test, y_train, y_test = train_test_split(
        feature_data, target_data, test_size=model_options['validation_size'],
        random_state=123, shuffle=False)

    # The batch size lives in keras_options; keras_model_type is a plain string.
    generator_args = dict(
        length=model_options['window_length'],
        sampling_rate=model_options['sampling_rate'],
        batch_size=keras_options['batch_size'],
        stride=model_options['stride'],
    )
    train_generator = TimeseriesGenerator(x_train, y_train, **generator_args)
    valid_generator = TimeseriesGenerator(x_test, y_test, **generator_args)

    ######################## Setting up Cat Vocab Dict #######################
    cat_vocab_dict = {}
    cat_vocab_dict['modeltype'] = 'Timeseries'
    cat_vocab_dict['target_variables'] = target
    cat_vocab_dict['project_name'] = project_name
    cat_vocab_dict['model_options'] = model_options
    cat_vocab_dict['keras_options'] = keras_options
    cat_vocab_dict['nlp_vars'] = ""
    cat_vocab_dict['bools'] = False
    cat_vocab_dict['bools_converted'] = False
    cat_vocab_dict['num_labels'] = ""
    cat_vocab_dict['num_classes'] = ""

    return train_generator, valid_generator, cat_vocab_dict
|
||
################################################################################## | ||
def load_text_data(text_directory, project_name, keras_options, model_options, | ||
verbose=0): | ||
""" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,7 @@ | |
|
||
############################################################################################# | ||
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error | ||
from sklearn.model_selection import train_test_split | ||
from IPython.core.display import Image, display | ||
import pickle | ||
############################################################################################# | ||
|
@@ -84,7 +85,7 @@ | |
from .data_load.extract import find_problem_type, transform_train_target | ||
from .data_load.extract import load_train_data, load_train_data_file | ||
from .data_load.extract import load_train_data_frame, load_image_data | ||
from .data_load.extract import load_text_data | ||
from .data_load.extract import load_text_data, load_train_timeseries | ||
|
||
# keras preprocessing | ||
from .preprocessing.preprocessing import perform_preprocessing | ||
|
@@ -108,6 +109,10 @@ | |
from .utilities.utilities import check_if_GPU_exists, plot_history | ||
from .utilities.utilities import save_model_architecture | ||
|
||
|
||
from .models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2, lstm1, gru1, rnn1 | ||
|
||
|
||
############################################################################################# | ||
### Split raw_train_set into train and valid data sets first | ||
### This is a better way to split a dataset into train and test #### | ||
|
@@ -222,6 +227,12 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep | |
"image_channels": default is "" (empty string). Needed only for image use case. Number of channels. | ||
'save_model_path': default is project_name/keras_model_type/datetime-hour-min/ | ||
If you provide your own model path as a string, it will save it there. | ||
"features": list: list of features from the input time series data (to be considered for time series prediction). | ||
"window_length": window length for the time series data (to be considered for time series prediction). | ||
"sampling_rate": sampling rate for the time series data (to be considered for time series prediction). | ||
"stride": stride for the time series (to be considered for time series prediction). | ||
"validation_size": train and validation split ratio (to be considered for time series prediction). | ||
"prebuilt-model": select the pre-built model from "lstm", "gru", "rnn" (to be considered for time series prediction). | ||
model_use_case: default is "" (empty string). If "pipeline", you will get back pipeline only, not model. | ||
It is a placeholder for future purposes. At the moment, leave it as empty string. | ||
verbose = 1 will give you more charts and outputs. verbose 0 will run silently | ||
|
@@ -266,7 +277,6 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep | |
os.makedirs(save_logs_path, exist_ok = True) | ||
|
||
print('Model and logs being saved in %s' %save_model_path) | ||
|
||
if keras_model_type.lower() in ['image', 'images', "image_classification"]: | ||
############### Now do special IMAGE processing here ################################### | ||
if 'image_directory' in model_options.keys(): | ||
|
@@ -323,6 +333,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep | |
print('\nSplitting train into 80+20 percent: train and validation data') | ||
valid_ds = full_ds.enumerate().filter(is_valid).map(recover) | ||
train_ds = full_ds.enumerate().filter(is_train).map(recover) | ||
|
||
|
||
################### P R E P R O C E S S T E X T ######################### | ||
try: | ||
deep_model = preprocessing_text(train_ds, keras_model_type, model_options) | ||
|
@@ -335,7 +347,62 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep | |
project_name, save_model_flag) | ||
print(deep_model.summary()) | ||
return deep_model, cat_vocab_dict | ||
elif keras_model_type.lower() in ['predict time series', 'time series', "time_series", "predict_time_series"]: | ||
""" | ||
Author: Adarsh C | ||
contact: [email protected] | ||
""" | ||
############### Get the features columns ################################### | ||
if 'features' in model_options.keys(): | ||
print(str(model_options['features'])+", features will be considered") | ||
else: | ||
print(' Must provide the features') | ||
return | ||
################ Load time series data ######### | ||
|
||
train_generator, valid_generator, cat_vocab_dict = load_train_timeseries( | ||
train_data_or_file, target, project_name, keras_options_copy, | ||
model_options_copy, keras_model_type, verbose=verbose) | ||
|
||
|
||
|
||
################### Choosing the Pre-Built model ######################### | ||
model = None | ||
if model_options['prebuilt-model'].lower() == "lstm": | ||
model = lstm1.make_lstm(model_options_copy) | ||
|
||
elif model_options['prebuilt-model'].lower() == "rnn": | ||
model = rnn1.make_rnn(model_options_copy) | ||
|
||
elif model_options['prebuilt-model'].lower() == "gru": | ||
model = gru1.make_gru(model_options_copy) | ||
else: | ||
print("Must choose lstm, gru, rnn in model_options['prebuilt-model'] ") | ||
return | ||
|
||
print(model.summary()) | ||
|
||
################### Training the Pre-Built model ######################### | ||
|
||
model.compile(loss='binary_crossentropy', optimizer=keras_options_copy['optimizer'],metrics=['acc']) | ||
|
||
if keras_options_copy["early_stopping"]: | ||
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', | ||
patience=2, | ||
mode='min') | ||
|
||
model.fit(train_generator, epochs=keras_options_copy['epochs'],batch_size=keras_options_copy['batch_size'], | ||
validation_data=valid_generator, | ||
shuffle=False, | ||
callbacks=[early_stopping]) | ||
else: | ||
model.fit(train_generator, epochs=keras_options_copy['epochs'],batch_size=keras_options_copy['batch_size'], | ||
validation_data=valid_generator, | ||
shuffle=False) | ||
|
||
cat_vocab_dict['train_generator'] = train_generator | ||
cat_vocab_dict['valid_generator'] = valid_generator | ||
return model, cat_vocab_dict | ||
shuffle_flag = False | ||
#### K E R A S O P T I O N S - THESE CAN BE OVERRIDDEN by your input keras_options dictionary #### | ||
keras_options_defaults = {} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,6 +49,13 @@ | |
from tensorflow.keras.optimizers import SGD | ||
from tensorflow.keras import regularizers | ||
|
||
|
||
from keras.preprocessing.text import Tokenizer | ||
from keras.preprocessing.sequence import TimeseriesGenerator | ||
from sklearn.preprocessing import MinMaxScaler, StandardScaler | ||
from sklearn.model_selection import train_test_split | ||
|
||
|
||
############################################################################################ | ||
# data pipelines | ||
from deep_autoviml.data_load.classify_features import classify_features_using_pandas | ||
|
@@ -290,6 +297,8 @@ def load_model_dict(model_or_model_path, cat_vocab_dict, project_name, keras_mod | |
else: | ||
if modeltype == 'Regression': | ||
model = tf.keras.models.load_model(os.path.join(model_or_model_path)) | ||
if modeltype == "Timeseries": | ||
model = tf.keras.models.load_model(os.path.join(model_or_model_path)) | ||
else: | ||
model = tf.keras.models.load_model(os.path.join(model_or_model_path), | ||
custom_objects={'BalancedSparseCategoricalAccuracy': BalancedSparseCategoricalAccuracy}) | ||
|
@@ -309,6 +318,9 @@ def predict(model_or_model_path, project_name, test_dataset, | |
start_time2 = time.time() | ||
model, cat_vocab_dict = load_model_dict(model_or_model_path, cat_vocab_dict, project_name, keras_model_type) | ||
##### load the test data set here ####### | ||
|
||
|
||
|
||
if keras_model_type.lower() in ['nlp', 'text']: | ||
NLP_VARS = cat_vocab_dict['predictors_in_train'] | ||
else: | ||
|
@@ -335,7 +347,16 @@ def combine_nlp_text(features): | |
keepdims=False, separator=' ') | ||
return y | ||
################################################################ | ||
if isinstance(test_dataset, str): | ||
feature_data = None | ||
target_data = None | ||
if keras_model_type.lower() in ['predict time series', 'time series', "time_series" "predict_time_series"]: | ||
|
||
scaler = MinMaxScaler() | ||
test_generator, feature_data, target_data = load_test_timeseries( | ||
test_dataset, cat_vocab_dict['target_variables'], project_name, cat_vocab_dict['keras_options'], | ||
cat_vocab_dict['model_options'], keras_model_type, scaler, verbose=verbose) | ||
|
||
elif isinstance(test_dataset, str): | ||
test_ds, cat_vocab_dict2, test_small = load_test_data(test_dataset, project_name=project_name, | ||
cat_vocab_dict=cat_vocab_dict, verbose=verbose) | ||
### You have to load only the NLP or text variables into dataset. otherwise, it will fail during predict | ||
|
@@ -392,7 +413,7 @@ def combine_nlp_text(features): | |
print(' combined NLP or text vars: %s into a single combined_nlp_text successfully' %NLP_VARS) | ||
else: | ||
print('No NLP vars in data set. No preprocessing done.') | ||
cat_vocab_dict2 = copy.deepcopy(cat_vocab_dict) | ||
cat_vocab_dict2 = copy.deepcopy(cat_vocab_dict) | ||
################################################################################## | ||
if cat_vocab_dict2['bools_converted']: | ||
BOOLS = [] | ||
|
@@ -473,12 +494,26 @@ def convert_boolean_to_string_predict(features_copy): | |
num_classes = cat_vocab_dict2['num_classes'] | ||
####### save the predictions only upto input size ### | ||
######## This is where we start predictions on test data set ############## | ||
try: | ||
#try: | ||
if keras_model_type.lower() in ['predict time series', 'time series', "time_series" "predict_time_series"]: | ||
predictions = model.predict_generator(test_generator) | ||
|
||
df_pred=pd.concat([pd.DataFrame(predictions), pd.DataFrame(feature_data[:,1:][cat_vocab_dict['model_options']["length"]:])],axis=1) | ||
#df_pred=pd.concat([pd.DataFrame(predictions), pd.DataFrame(x_test[:,1:][win_length:])],axis=1) | ||
|
||
rev_trans=scaler.inverse_transform(df_pred) | ||
df = pd.read_csv(test_dataset) | ||
|
||
y_probas=df[cat_vocab_dict['model_options']['features']][predictions.shape[0]*-1:] | ||
|
||
y_probas[str(cat_vocab_dict['target_variables'])+'_pred']=rev_trans[:,0] | ||
|
||
else: | ||
y_probas = model.predict(test_ds, steps=num_steps) | ||
except: | ||
print('ERROR: Predictions from model erroring.') | ||
print(' Check your model and ensure test data and their dtypes are same as train data and retry again.') | ||
return | ||
#except: | ||
# print('ERROR: Predictions from model erroring.') | ||
# print(' Check your model and ensure test data and their dtypes are same as train data and retry again.') | ||
# return | ||
###### Now convert the model predictions into classes ######### | ||
try: | ||
y_test_preds_list = convert_predictions_from_model(y_probas, cat_vocab_dict2, DS_LEN) | ||
|
@@ -492,6 +527,29 @@ def convert_boolean_to_string_predict(features_copy): | |
print('Time taken in mins for predictions = %0.0f' %((time.time()-start_time2)/60)) | ||
return y_test_preds_list | ||
############################################################################################ | ||
|
||
def load_test_timeseries(test_data_or_file, target, project_name, keras_options, model_options,
                         keras_model_type, scaler, verbose=0):
    """
    Author: Adarsh C
    contact: [email protected]
    Load testing data from a CSV file, scale it with the given scaler, and
    wrap it in a Keras TimeseriesGenerator.

    Inputs:
        test_data_or_file: path (or buffer) handed to pd.read_csv.
        target: name of the target column. It must be one of model_options['features'].
        project_name: not used here.
        keras_options: dictionary; 'batch_size' is read from it.
        model_options: dictionary; 'features' and 'window_length' are read from it.
        keras_model_type: not used here.
        scaler: scaler object exposing fit_transform; it is (re)fit on the test data.
        verbose: not used here.

    Returns:
        (test_generator, feature_data, target_data), where feature_data is the scaled
        feature array and target_data is its target column.

    Raises:
        ValueError: if target is not listed in model_options['features'].
    """

    # Source: https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
    # Source_Author: https://github.com/srivatsan88

    df = pd.read_csv(test_data_or_file)

    features = list(model_options['features'])
    if target not in features:
        raise ValueError(
            "target '%s' must be one of model_options['features']: %s" % (target, features))

    # NOTE(review): fit_transform refits the scaler on the test file, so the scaling
    # range comes from the test data, not from the training data.
    feature_data = scaler.fit_transform(df[features])

    # feature_data only holds the selected feature columns, so the target must be
    # located by its position in `features`, not by its position in the full frame.
    target_data = feature_data[:, features.index(target)]

    test_generator = TimeseriesGenerator(
        feature_data, target_data, length=model_options['window_length'],
        sampling_rate=1, batch_size=keras_options['batch_size'])
    return test_generator, feature_data, target_data
|
||
############################################################################################ | ||
|
||
|
||
def convert_predictions_from_model(y_probas, cat_vocab_dict, DS_LEN): | ||
y_test_preds_list = [] | ||
target = cat_vocab_dict['target_variables'] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
############################################################################################ | ||
#Copyright 2021 Google LLC | ||
|
||
#Licensed under the Apache License, Version 2.0 (the "License"); | ||
#you may not use this file except in compliance with the License. | ||
#You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
#Unless required by applicable law or agreed to in writing, software | ||
#distributed under the License is distributed on an "AS IS" BASIS, | ||
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
#See the License for the specific language governing permissions and | ||
#limitations under the License. | ||
############################################################################################ | ||
import tensorflow as tf | ||
from tensorflow import keras | ||
#### Make sure it is Tensorflow 2.4 or greater! | ||
from tensorflow.keras.optimizers import SGD, Adam, RMSprop | ||
from tensorflow.keras import layers | ||
from tensorflow.keras import optimizers | ||
from tensorflow.keras import models | ||
from tensorflow.keras import callbacks | ||
from tensorflow.keras import backend as K | ||
from tensorflow.keras import utils | ||
from tensorflow.keras import layers | ||
from tensorflow.keras.layers import BatchNormalization | ||
from tensorflow.keras.optimizers import SGD | ||
from tensorflow.keras import regularizers | ||
from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D | ||
from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D | ||
from tensorflow.keras import Model, Sequential | ||
from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, GRU, LeakyReLU | ||
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D | ||
from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D | ||
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization | ||
############################################################################################ | ||
|
||
def make_gru(model_options):
    '''
    Author: Adarsh C
    Date created: 30/01/2022
    Date last modified: 30/01/2022
    contact: [email protected]
    Build (but do not compile) a stacked GRU model ending in a single Dense unit.
    Inputs:
    model_options: dictionary of model hyper parameters. 'window_length' and
        'features' are read from it to set the input shape of the first layer.
    Returns:
    the assembled tf.keras.Sequential model
    '''
    timesteps = model_options['window_length']
    n_features = len(model_options['features'])

    # Three GRU layers (128 -> 128 -> 64); only the last one collapses the sequence.
    stack = [
        GRU(128, input_shape=(timesteps, n_features), return_sequences=True),
        LeakyReLU(alpha=0.5),
        GRU(128, return_sequences=True),
        LeakyReLU(alpha=0.5),
        Dropout(0.3),
        GRU(64, return_sequences=False),
        Dropout(0.3),
        Dense(1),
    ]
    return tf.keras.Sequential(stack)
Oops, something went wrong.