Merge pull request #22 from chekoduadarsh/master

Feature: Support for Seq2Seq (LSTM) model for next word prediction
AutoViML · Nov 20, 2023 · a5ca6be · a5ca6be
2 parents e960c92 + 41b5142
commit a5ca6be
Show file tree

Hide file tree

Showing 9 changed files with 1,503 additions and 12 deletions.
diff --git a/Deep_Auto_ViML_Timeseries.ipynb b/Deep_Auto_ViML_Timeseries.ipynb
diff --git a/deep_autoviml/data_load/extract.py b/deep_autoviml/data_load/extract.py
@@ -51,6 +51,11 @@
 tf.random.set_seed(42)
 from tensorflow.keras import layers
 from tensorflow import keras
+
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import TimeseriesGenerator
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+
 ############################################################################################
 #### probably the most handy function of all!
 def left_subtract(l1,l2):
@@ -1194,6 +1199,49 @@ def is_test(x, y):
 def is_train(x, y):
     return not is_test(x, y)
 ##################################################################################
+
+def load_train_timeseries(train_data_or_file, target, project_name, keras_options, model_options,
+                  keras_model_type, verbose=0):
+
+    """
+    Author: Adarsh C
+    contact: [email protected]
+
+    Load the training data from a csv file, scale it, split it into training and
+    validation parts (without shuffling) and wrap each part in a Keras TimeseriesGenerator.
+
+    Inputs:
+    train_data_or_file: path (or buffer) of a csv file; passed directly to pd.read_csv.
+    target: name of the target column. It must be one of model_options['features'].
+    project_name: stored as-is in the returned cat_vocab_dict.
+    keras_options: dict; 'batch_size' is read for both generators.
+    model_options: dict; 'features', 'validation_size', 'window_length',
+            'sampling_rate' and 'stride' are read.
+    keras_model_type: not used by this function (kept for interface compatibility).
+    verbose: not used by this function.
+
+    Outputs:
+    train_generator, valid_generator: TimeseriesGenerator objects over the two splits.
+    cat_vocab_dict: dict of metadata describing this time series run.
+
+    Raises:
+    ValueError: if target is not listed in model_options['features'].
+    """
+    # Source:   https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
+    # Source_Author: https://github.com/srivatsan88
+
+    # Imported locally so this function does not depend on a module-level import being present.
+    from sklearn.model_selection import train_test_split
+
+    df = pd.read_csv(train_data_or_file)  # Currently supports only .csv
+
+    features = model_options['features']
+    if target not in features:
+        raise ValueError("target '%s' must be one of model_options['features']: %s" % (target, features))
+
+    # NOTE: the scaler is fit on the full data set before the train/validation split
+    # and is not returned to the caller.
+    scaler = MinMaxScaler()
+    feature_data = scaler.fit_transform(df[features])
+
+    # feature_data only holds the selected feature columns (in the order given in
+    # `features`), so the target must be located by its position in that list,
+    # not by its position in the original dataframe.
+    target_data = feature_data[:, list(features).index(target)]
+
+    # shuffle=False keeps the rows in their original (time) order.
+    x_train, x_test, y_train, y_test = train_test_split(
+        feature_data, target_data, test_size=model_options['validation_size'],
+        random_state=123, shuffle=False)
+
+    # Both generators share the same windowing settings; batch_size lives in
+    # keras_options (keras_model_type is a plain string and cannot be indexed by key).
+    generator_args = dict(length=model_options['window_length'],
+                          sampling_rate=model_options['sampling_rate'],
+                          batch_size=keras_options['batch_size'],
+                          stride=model_options['stride'])
+    train_generator = TimeseriesGenerator(x_train, y_train, **generator_args)
+    valid_generator = TimeseriesGenerator(x_test, y_test, **generator_args)
+
+    ######################## Setting up Cat Vocab Dict #######################
+    cat_vocab_dict = {}
+    cat_vocab_dict['modeltype'] = 'Timeseries'
+    cat_vocab_dict['target_variables'] = target
+    cat_vocab_dict['project_name'] = project_name
+    cat_vocab_dict['model_options'] = model_options
+    cat_vocab_dict['keras_options'] = keras_options
+    cat_vocab_dict['nlp_vars'] = ""
+    cat_vocab_dict['bools'] = False
+    cat_vocab_dict['bools_converted'] = False
+    cat_vocab_dict['num_labels'] = ""
+    cat_vocab_dict['num_classes']  = ""
+
+    return train_generator, valid_generator, cat_vocab_dict
+
+##################################################################################
 def load_text_data(text_directory, project_name, keras_options, model_options,
                         verbose=0):
     """

diff --git a/deep_autoviml/deep_autoviml.py b/deep_autoviml/deep_autoviml.py
@@ -53,6 +53,7 @@
 
 #############################################################################################
 from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
+from sklearn.model_selection import train_test_split
 from IPython.core.display import Image, display
 import pickle
 #############################################################################################
@@ -84,7 +85,7 @@
 from .data_load.extract import find_problem_type, transform_train_target
 from .data_load.extract import load_train_data, load_train_data_file
 from .data_load.extract import load_train_data_frame, load_image_data
-from .data_load.extract import load_text_data
+from .data_load.extract import load_text_data, load_train_timeseries
 
 # keras preprocessing
 from .preprocessing.preprocessing import perform_preprocessing
@@ -108,6 +109,10 @@
 from .utilities.utilities import check_if_GPU_exists, plot_history
 from .utilities.utilities import save_model_architecture
 
+
+from .models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2, lstm1, gru1, rnn1
+
+
 #############################################################################################
 ### Split raw_train_set into train and valid data sets first
 ### This is a better way to split a dataset into train and test ####
@@ -222,6 +227,12 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
             "image_channels": default is "" (empty string). Needed only for image use case. Number of channels.
             'save_model_path': default is project_name/keras_model_type/datetime-hour-min/
                         If you provide your own model path as a string, it will save it there.
+            "features": list: list of features from the input time series data (to be considered for timeseries prediction).
+            "window_length": window length for the time series data (to be considered for timeseries prediction).
+            "sampling_rate": sampling rate for the time series data (to be considered for timeseries prediction).
+            "stride": stride for the time series (to be considered for timeseries prediction).
+            "validation_size": train and validation split ratio (to be considered for timeseries prediction).
+            "prebuilt-model": select the pre-built model from "lstm", "gru", "rnn" (to be considered for timeseries prediction).
     model_use_case: default is "" (empty string). If "pipeline", you will get back pipeline only, not model.
                 It is a placeholder for future purposes. At the moment, leave it as empty string.
     verbose = 1 will give you more charts and outputs. verbose 0 will run silently
@@ -266,7 +277,6 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
         os.makedirs(save_logs_path, exist_ok = True)
 
     print('Model and logs being saved in %s' %save_model_path)
-
     if keras_model_type.lower() in ['image', 'images', "image_classification"]:
         ###############   Now do special IMAGE processing here ###################################
         if 'image_directory' in model_options.keys():
@@ -323,6 +333,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
             print('\nSplitting train into 80+20 percent: train and validation data')
             valid_ds = full_ds.enumerate().filter(is_valid).map(recover)
             train_ds = full_ds.enumerate().filter(is_train).map(recover)
+
+
         ###################  P R E P R O C E S S    T E X T   #########################
         try:
             deep_model = preprocessing_text(train_ds, keras_model_type, model_options)
@@ -335,7 +347,62 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep
                                             project_name, save_model_flag)
         print(deep_model.summary())
         return deep_model, cat_vocab_dict
+    elif keras_model_type.lower() in ['predict time series', 'time series', "time_series", "predict_time_series"]:
+        """
+        Author: Adarsh C
+        contact: [email protected]
+        """
+        ############### Get the features columns ###################################
+        if 'features' in model_options.keys():
+            print(str(model_options['features'])+", features will be considered")
+        else:
+            print(' Must provide the features')
+            return 
+        ################   Load time series data   #########
+
+        train_generator, valid_generator, cat_vocab_dict = load_train_timeseries(
+                            train_data_or_file, target, project_name, keras_options_copy,
+                                model_options_copy, keras_model_type, verbose=verbose)
+
+
+
+        ###################  Choosing the Pre-Built model #########################
+        model = None
+        if model_options['prebuilt-model'].lower() == "lstm":
+            model = lstm1.make_lstm(model_options_copy)
+
+        elif model_options['prebuilt-model'].lower() == "rnn":
+            model = rnn1.make_rnn(model_options_copy)
+
+        elif model_options['prebuilt-model'].lower() == "gru":
+            model = gru1.make_gru(model_options_copy)
+        else:
+            print("Must choose lstm, gru, rnn in model_options['prebuilt-model'] ")
+            return
+
+        print(model.summary())
+
+        ###################  Training the Pre-Built model #########################
+
+        model.compile(loss='binary_crossentropy', optimizer=keras_options_copy['optimizer'],metrics=['acc'])
+
+        if keras_options_copy["early_stopping"]:
+            early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
+                                                                patience=2,
+                                                                mode='min')
+
+            model.fit(train_generator, epochs=keras_options_copy['epochs'],batch_size=keras_options_copy['batch_size'], 
+                        validation_data=valid_generator, 
+                        shuffle=False,
+                        callbacks=[early_stopping])
+        else:
+            model.fit(train_generator, epochs=keras_options_copy['epochs'],batch_size=keras_options_copy['batch_size'], 
+                        validation_data=valid_generator, 
+                        shuffle=False)
 
+        cat_vocab_dict['train_generator'] = train_generator
+        cat_vocab_dict['valid_generator'] = valid_generator
+        return model, cat_vocab_dict
     shuffle_flag = False
     ####   K E R A S    O P T I O N S   - THESE CAN BE OVERRIDDEN by your input keras_options dictionary ####
     keras_options_defaults = {}

diff --git a/deep_autoviml/modeling/create_model.py b/deep_autoviml/modeling/create_model.py
@@ -27,7 +27,7 @@
 from collections import defaultdict
 ############################################################################################
 # data pipelines and feature engg here
-from deep_autoviml.models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2
+from deep_autoviml.models import basic, dnn, reg_dnn, dnn_drop, giant_deep, cnn1, cnn2, lstm1
 from deep_autoviml.preprocessing.preprocessing_tabular import encode_fast_inputs, create_fast_inputs
 from deep_autoviml.preprocessing.preprocessing_tabular import encode_all_inputs, create_all_inputs
 from deep_autoviml.preprocessing.preprocessing_tabular import encode_num_inputs, encode_auto_inputs
@@ -259,6 +259,7 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output
         return model_body, keras_options
     ##########################   This is for non-auto models #####################################
     if isinstance(use_my_model, str) :
+        print("models  "+keras_model_type.lower())
         if use_my_model == "":
             if keras_model_type.lower() in ['basic', 'simple', 'default','sample model']:
                 ##########  Now that we have setup the layers correctly, we can build some more hidden layers

diff --git a/deep_autoviml/modeling/predict_model.py b/deep_autoviml/modeling/predict_model.py
@@ -49,6 +49,13 @@
 from tensorflow.keras.optimizers import SGD
 from tensorflow.keras import regularizers
 
+
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import TimeseriesGenerator
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.model_selection import train_test_split
+
+
 ############################################################################################
 # data pipelines
 from deep_autoviml.data_load.classify_features import classify_features_using_pandas
@@ -290,6 +297,8 @@ def load_model_dict(model_or_model_path, cat_vocab_dict, project_name, keras_mod
             else:
                 if modeltype == 'Regression':
                     model = tf.keras.models.load_model(os.path.join(model_or_model_path))
+                if modeltype == "Timeseries":                    
+                    model = tf.keras.models.load_model(os.path.join(model_or_model_path))
                 else:
                     model = tf.keras.models.load_model(os.path.join(model_or_model_path),
                             custom_objects={'BalancedSparseCategoricalAccuracy': BalancedSparseCategoricalAccuracy})
@@ -309,6 +318,9 @@ def predict(model_or_model_path, project_name, test_dataset,
     start_time2 = time.time()
     model, cat_vocab_dict = load_model_dict(model_or_model_path, cat_vocab_dict, project_name, keras_model_type)
     ##### load the test data set here #######
+
+
+
     if keras_model_type.lower() in ['nlp', 'text']:
         NLP_VARS = cat_vocab_dict['predictors_in_train']
     else:
@@ -335,7 +347,16 @@ def combine_nlp_text(features):
                 keepdims=False, separator=' ')
         return y
     ################################################################
-    if isinstance(test_dataset, str):
+    feature_data = None
+    target_data = None
+    if keras_model_type.lower() in ['predict time series', 'time series', "time_series" "predict_time_series"]:   
+
+        scaler = MinMaxScaler()     
+        test_generator, feature_data, target_data = load_test_timeseries(
+                            test_dataset, cat_vocab_dict['target_variables'], project_name, cat_vocab_dict['keras_options'],
+                                cat_vocab_dict['model_options'], keras_model_type, scaler, verbose=verbose)
+
+    elif isinstance(test_dataset, str):
         test_ds, cat_vocab_dict2, test_small = load_test_data(test_dataset, project_name=project_name,
                                 cat_vocab_dict=cat_vocab_dict, verbose=verbose)
         ### You have to load only the NLP or text variables into dataset. otherwise, it will fail during predict
@@ -392,7 +413,7 @@ def combine_nlp_text(features):
                 print('    combined NLP or text vars: %s into a single combined_nlp_text successfully' %NLP_VARS)
         else:
             print('No NLP vars in data set. No preprocessing done.')
-        cat_vocab_dict2 = copy.deepcopy(cat_vocab_dict)
+    cat_vocab_dict2 = copy.deepcopy(cat_vocab_dict)
     ##################################################################################
     if cat_vocab_dict2['bools_converted']:
         BOOLS = []
@@ -473,12 +494,26 @@ def convert_boolean_to_string_predict(features_copy):
     num_classes = cat_vocab_dict2['num_classes']    
     ####### save the predictions only upto input size ###
     ########  This is where we start predictions on test data set ##############
-    try:
+    #try:
+    if keras_model_type.lower() in ['predict time series', 'time series', "time_series" "predict_time_series"]:        
+        predictions = model.predict_generator(test_generator)
+
+        df_pred=pd.concat([pd.DataFrame(predictions), pd.DataFrame(feature_data[:,1:][cat_vocab_dict['model_options']["length"]:])],axis=1)
+        #df_pred=pd.concat([pd.DataFrame(predictions), pd.DataFrame(x_test[:,1:][win_length:])],axis=1)
+
+        rev_trans=scaler.inverse_transform(df_pred)
+        df = pd.read_csv(test_dataset)
+
+        y_probas=df[cat_vocab_dict['model_options']['features']][predictions.shape[0]*-1:]
+
+        y_probas[str(cat_vocab_dict['target_variables'])+'_pred']=rev_trans[:,0]
+
+    else:
         y_probas = model.predict(test_ds, steps=num_steps)
-    except:
-        print('ERROR: Predictions from model erroring.')
-        print('    Check your model and ensure test data and their dtypes are same as train data and retry again.')
-        return
+    #except:
+    #    print('ERROR: Predictions from model erroring.')
+    #    print('    Check your model and ensure test data and their dtypes are same as train data and retry again.')
+    #    return
     ######  Now convert the model predictions into classes #########
     try:
         y_test_preds_list = convert_predictions_from_model(y_probas, cat_vocab_dict2, DS_LEN)
@@ -492,6 +527,29 @@ def convert_boolean_to_string_predict(features_copy):
     print('Time taken in mins for predictions = %0.0f' %((time.time()-start_time2)/60))
     return y_test_preds_list
 ############################################################################################
+
+def load_test_timeseries(test_data_or_file, target, project_name, keras_options, model_options,
+                  keras_model_type, scaler, verbose=0):
+    """
+    Author: Adarsh C
+    contact: [email protected]
+
+    Load the testing data from a csv file, scale it with the given scaler and
+    wrap it in a Keras TimeseriesGenerator.
+
+    Inputs:
+    test_data_or_file: path (or buffer) of a csv file; passed directly to pd.read_csv.
+    target: name of the target column. It must be one of model_options['features'].
+    project_name: not used by this function.
+    keras_options: dict; 'batch_size' is read.
+    model_options: dict; 'features' and 'window_length' are read.
+    keras_model_type: not used by this function.
+    scaler: scaler object; it is (re)fit on the test data via fit_transform, so the
+            caller can afterwards use it to invert the scaling.
+    verbose: not used by this function.
+
+    Outputs:
+    test_generator: TimeseriesGenerator over the scaled test data.
+    feature_data: scaled feature array, columns in the order of model_options['features'].
+    target_data: the scaled target column taken from feature_data.
+
+    Raises:
+    ValueError: if target is not listed in model_options['features'].
+    """
+
+    # Source:   https://github.com/srivatsan88/End-to-End-Time-Series/blob/master/Multivariate_Time_Series_Modeling_using_LSTM.ipynb
+    # Source_Author: https://github.com/srivatsan88
+
+    df = pd.read_csv(test_data_or_file)
+
+    features = model_options['features']
+    if target not in features:
+        raise ValueError("target '%s' must be one of model_options['features']: %s" % (target, features))
+
+    feature_data = scaler.fit_transform(df[features])
+
+    # feature_data only holds the selected feature columns (in the order given in
+    # `features`), so the target must be located by its position in that list,
+    # not by its position in the original dataframe.
+    target_data = feature_data[:, list(features).index(target)]
+
+    # sampling_rate is fixed at 1 here and stride is left at the generator's default.
+    test_generator = TimeseriesGenerator(feature_data, target_data,
+                                         length=model_options['window_length'],
+                                         sampling_rate=1,
+                                         batch_size=keras_options['batch_size'])
+    return test_generator, feature_data, target_data
+
+############################################################################################
+
+
 def convert_predictions_from_model(y_probas, cat_vocab_dict, DS_LEN):
     y_test_preds_list = []
     target = cat_vocab_dict['target_variables']

diff --git a/deep_autoviml/models/gru1.py b/deep_autoviml/models/gru1.py
@@ -0,0 +1,57 @@
+############################################################################################
+#Copyright 2021 Google LLC
+
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+############################################################################################
+import tensorflow as tf
+from tensorflow import keras
+#### Make sure it is Tensorflow 2.4 or greater!
+from tensorflow.keras.optimizers import SGD, Adam, RMSprop
+from tensorflow.keras import layers
+from tensorflow.keras import optimizers
+from tensorflow.keras import models
+from tensorflow.keras import callbacks
+from tensorflow.keras import backend as K
+from tensorflow.keras import utils
+from tensorflow.keras import layers
+from tensorflow.keras.layers import BatchNormalization
+from tensorflow.keras.optimizers import SGD
+from tensorflow.keras import regularizers
+from tensorflow.keras.layers import Reshape, MaxPooling1D, MaxPooling2D
+from tensorflow.keras.layers import AveragePooling2D, AveragePooling1D
+from tensorflow.keras import Model, Sequential
+from tensorflow.keras.layers import Embedding, Reshape, Dropout, Dense, GRU, LeakyReLU
+from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
+from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, Conv1D
+from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
+############################################################################################
+
+def make_gru(model_options):
+    '''
+    Author: Adarsh C
+    Date created: 30/01/2022
+    Date last modified: 30/01/2022
+    contact: [email protected]
+
+    Build (but do not compile) a stacked GRU network that ends in a single-unit Dense layer.
+
+    Inputs:
+    model_options: contains important model hyper parameters. 'window_length' and the
+            number of entries in 'features' define the input shape of the first GRU layer.
+    Outputs:
+    model: a tf.keras.Sequential model.
+    '''
+    timesteps = model_options['window_length']
+    n_features = len(model_options['features'])
+
+    # Layers are listed in the order they are stacked.
+    gru_stack = [
+        GRU(128, input_shape=(timesteps, n_features), return_sequences=True),
+        LeakyReLU(alpha=0.5),
+        GRU(128, return_sequences=True),
+        LeakyReLU(alpha=0.5),
+        Dropout(0.3),
+        # Last recurrent layer returns only its final output, not the full sequence.
+        GRU(64, return_sequences=False),
+        Dropout(0.3),
+        Dense(1),
+    ]
+    return tf.keras.Sequential(gru_stack)