diff --git a/example/named_entity_recognition/src/metrics.py b/example/named_entity_recognition/src/metrics.py
index ef5f64fb1af3..a1d270af6863 100644
--- a/example/named_entity_recognition/src/metrics.py
+++ b/example/named_entity_recognition/src/metrics.py
@@ -19,6 +19,7 @@
 # -*- coding: utf-8 -*-
 
+import logging
 import mxnet as mx
 import numpy as np
 import pickle
@@ -43,17 +44,17 @@ def classifer_metrics(label, pred):
     corr_pred = (prediction == label) == (pred_is_entity == True)
 
     #how many entities are there?
-    num_entities = np.sum(label_is_entity)
-    entity_preds = np.sum(pred_is_entity)
-
+    # better to cast to float for safer further ratio computations
+    num_entities = float(np.sum(label_is_entity))
+    entity_preds = float(np.sum(pred_is_entity))
     #how many times did we correctly predict an entity?
-    correct_entitites = np.sum(corr_pred[pred_is_entity])
+    correct_entitites = float(np.sum(corr_pred[pred_is_entity]))
 
     #precision: when we predict entity, how often are we right?
     if entity_preds == 0:
         precision = np.nan
     else:
-        precision = correct_entitites/entity_preds
+        precision = correct_entitites / entity_preds
 
     #recall: of the things that were an entity, how many did we catch?
     recall = correct_entitites / num_entities
@@ -64,6 +65,8 @@ def classifer_metrics(label, pred):
         f1 = 0
     else:
         f1 = 2 * precision * recall / (precision + recall)
+
+    logging.debug("Metrics results: precision=%f recall=%f f1=%f", precision, recall, f1)
     return precision, recall, f1
 
 def entity_precision(label, pred):
diff --git a/example/named_entity_recognition/src/ner.py b/example/named_entity_recognition/src/ner.py
index 7f5dd84527cc..6accb2826f2d 100644
--- a/example/named_entity_recognition/src/ner.py
+++ b/example/named_entity_recognition/src/ner.py
@@ -93,6 +93,7 @@ def build_vocab(nested_list):
     """
     # Build vocabulary
    word_counts = Counter(itertools.chain(*nested_list))
+    logging.info("build_vocab: word_counts=%d", len(word_counts))
 
     # Mapping from index to label
     vocabulary_inv = [x[0] for x in word_counts.most_common()]
@@ -114,6 +115,7 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None)
     :param buckets: size of each bucket in the iterators
     :return: train_iter, val_iter, word_to_index, index_to_word, pos_to_index, index_to_pos
     """
+
     # Read in data as numpy array
     df = pd.read_pickle(os.path.join(data_dir, "ner_data.pkl"))[:max_records]
 
@@ -135,12 +137,14 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None)
 
     # Split into training and testing data
     idx=int(len(indexed_tokens)*train_fraction)
+    logging.info("Preparing train/test datasets splitting at idx %d on total %d sentences using a batch size of %d", idx, len(indexed_tokens), batch_size)
     X_token_train, X_char_train, Y_train = indexed_tokens[:idx], indexed_chars[:idx], indexed_entities[:idx]
     X_token_test, X_char_test, Y_test = indexed_tokens[idx:], indexed_chars[idx:], indexed_entities[idx:]
 
     # build iterators to feed batches to network
     train_iter = iterators.BucketNerIter(sentences=X_token_train, characters=X_char_train, label=Y_train,
                                          max_token_chars=5, batch_size=batch_size, buckets=buckets)
+    logging.info("Creating the val_iter using %d sentences", len(X_token_test))
     val_iter = iterators.BucketNerIter(sentences=X_token_test, characters=X_char_test, label=Y_test,
                                        max_token_chars=train_iter.max_token_chars, batch_size=batch_size, buckets=train_iter.buckets)
     return train_iter, val_iter, word_to_index, char_to_index, entity_to_index
@@ -205,6 +209,8 @@ def sym_gen(seq_len):
 def train(train_iter, val_iter):
     import metrics
     devs = mx.cpu() if args.gpus is None or args.gpus is '' else [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    logging.info("train on device %s using optimizer %s at learning rate %f for %d epochs using %d records: lstm_state_size=%d ...",
+                 devs, args.optimizer, args.lr, args.num_epochs, args.max_records, args.lstm_state_size)
     module = mx.mod.BucketingModule(sym_gen, train_iter.default_bucket_key, context=devs)
     module.fit(train_data=train_iter,
                eval_data=val_iter,
@@ -225,6 +231,8 @@ def train(train_iter, val_iter):
     train_iter, val_iter, word_to_index, char_to_index, entity_to_index = build_iters(args.data_dir, args.max_records,
                                                                                       args.train_fraction, args.batch_size, args.buckets)
 
+    logging.info("validation iterator: %s", val_iter)
+
     # Define the recurrent layer
     bi_cell = mx.rnn.SequentialRNNCell()
     for layer_num in range(args.lstm_layers):
diff --git a/example/named_entity_recognition/src/preprocess.py b/example/named_entity_recognition/src/preprocess.py
index 6ae348ad8bae..22c1c360c62d 100644
--- a/example/named_entity_recognition/src/preprocess.py
+++ b/example/named_entity_recognition/src/preprocess.py
@@ -19,6 +19,7 @@
 # -*- coding: utf-8 -*-
 
+import logging
 import pandas as pd
 import numpy as np
 
 
@@ -45,6 +46,12 @@
 
 #join the results on utterance id
 df = df1.merge(df2.merge(df3, how = "left", on = "utterance_id"), how = "left", on = "utterance_id")
 
+pd.option_context('display.max_colwidth', None)
+pd.option_context('display.max_rowwidth', None)
+
+logging.info("preprocess: 1st sentence:")
+logging.info(df['token'].iloc[0].tolist())
+logging.info(df['BILOU_tag'].iloc[0].tolist())
 #save the dataframe to a csv file
 df.to_pickle("../data/ner_data.pkl")
\ No newline at end of file