apache · szha · Jul 21, 2019 · May 30, 2019
@@ -19,6 +19,7 @@
 
 # -*- coding: utf-8 -*-
 
+import logging
 import mxnet as mx
 import numpy as np
 import pickle
@@ -43,17 +44,17 @@ def classifer_metrics(label, pred):
     corr_pred = (prediction == label) == (pred_is_entity == True)
 
     #how many entities are there?
-    num_entities = np.sum(label_is_entity)
-    entity_preds = np.sum(pred_is_entity)
-
+    # cast the counts to float so the ratios below are computed in floating point
+    num_entities = float(np.sum(label_is_entity))
+    entity_preds = float(np.sum(pred_is_entity)) 
     #how many times did we correctly predict an entity?
-    correct_entitites = np.sum(corr_pred[pred_is_entity])
+    correct_entitites = float(np.sum(corr_pred[pred_is_entity]))
 
     #precision: when we predict entity, how often are we right?
     if entity_preds == 0:
         precision = np.nan
     else:
-        precision = correct_entitites/entity_preds
+        precision = correct_entitites / entity_preds
 
     #recall: of the things that were an entity, how many did we catch?
     recall = correct_entitites / num_entities
@@ -64,6 +65,8 @@ def classifer_metrics(label, pred):
         f1 = 0
     else:
         f1 = 2 * precision * recall / (precision + recall)
+
+    logging.debug("Metrics results: precision=%f recall=%f f1=%f", precision, recall, f1)
     return precision, recall, f1
 
 def entity_precision(label, pred):

@@ -93,6 +93,7 @@ def build_vocab(nested_list):
     """
     # Build vocabulary
     word_counts = Counter(itertools.chain(*nested_list))
+    logging.info("build_vocab: word_counts=%d" % (len(word_counts)))
 
     # Mapping from index to label
     vocabulary_inv = [x[0] for x in word_counts.most_common()]
@@ -114,6 +115,7 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None)
     :param buckets: size of each bucket in the iterators
     :return: train_iter, val_iter, word_to_index, index_to_word, pos_to_index, index_to_pos
     """
+
     # Read in data as numpy array
     df = pd.read_pickle(os.path.join(data_dir, "ner_data.pkl"))[:max_records]
 
@@ -135,12 +137,14 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None)
 
     # Split into training and testing data
     idx=int(len(indexed_tokens)*train_fraction)
+    logging.info("Preparing train/test datasets splitting at idx %d on total %d sentences using a batchsize of %d", idx, len(indexed_tokens), batch_size)
     X_token_train, X_char_train, Y_train = indexed_tokens[:idx], indexed_chars[:idx], indexed_entities[:idx]
     X_token_test, X_char_test, Y_test = indexed_tokens[idx:], indexed_chars[idx:], indexed_entities[idx:]
 
     # build iterators to feed batches to network
     train_iter = iterators.BucketNerIter(sentences=X_token_train, characters=X_char_train, label=Y_train,
                                          max_token_chars=5, batch_size=batch_size, buckets=buckets)
+    logging.info("Creating the val_iter using %d sentences", len(X_token_test))
     val_iter = iterators.BucketNerIter(sentences=X_token_test, characters=X_char_test, label=Y_test,
                                          max_token_chars=train_iter.max_token_chars, batch_size=batch_size, buckets=train_iter.buckets)
     return train_iter, val_iter, word_to_index, char_to_index, entity_to_index
@@ -205,6 +209,8 @@ def sym_gen(seq_len):
 def train(train_iter, val_iter):
     import metrics
     devs = mx.cpu() if args.gpus is None or args.gpus is '' else [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    logging.info("train on device %s using optimizer %s at learningrate %f for %d epochs using %d records: lstm_state_size=%d ...",
+          devs, args.optimizer, args.lr, args.num_epochs, args.max_records, args.lstm_state_size)
     module = mx.mod.BucketingModule(sym_gen, train_iter.default_bucket_key, context=devs)
     module.fit(train_data=train_iter,
                eval_data=val_iter,
@@ -225,6 +231,8 @@ def train(train_iter, val_iter):
     train_iter, val_iter, word_to_index, char_to_index, entity_to_index = build_iters(args.data_dir, args.max_records,
                                                                      args.train_fraction, args.batch_size, args.buckets)
 
+    logging.info("validation iterator: %s", val_iter)
+
     # Define the recurrent layer
     bi_cell = mx.rnn.SequentialRNNCell()
     for layer_num in range(args.lstm_layers):

@@ -19,6 +19,7 @@
 
 # -*- coding: utf-8 -*-
 
+import logging
 import pandas as pd
 import numpy as np
 
@@ -45,6 +46,12 @@
 
 #join the results on utterance id
 df = df1.merge(df2.merge(df3, how = "left", on = "utterance_id"), how = "left", on = "utterance_id")
+pd.option_context('display.max_colwidth', None)
+pd.option_context('display.max_rowwidth', None)
+
+logging.info("preprocess: 1st sentence:")
+logging.info(df['token'].iloc[0].tolist())
+logging.info(df['BILOU_tag'].iloc[0].tolist())
 
 #save the dataframe to a pickle file
 df.to_pickle("../data/ner_data.pkl")