
logging (#15106)
WilliamTambellini authored and szha committed Jul 21, 2019
1 parent d599bc3 commit 9d859c8
Showing 3 changed files with 23 additions and 5 deletions.
13 changes: 8 additions & 5 deletions example/named_entity_recognition/src/metrics.py
@@ -19,6 +19,7 @@

 # -*- coding: utf-8 -*-

+import logging
 import mxnet as mx
 import numpy as np
 import pickle
@@ -43,17 +44,17 @@ def classifer_metrics(label, pred):
     corr_pred = (prediction == label) == (pred_is_entity == True)

     #how many entities are there?
-    num_entities = np.sum(label_is_entity)
-    entity_preds = np.sum(pred_is_entity)
-
+    # better to cast to float for safer further ratio computations
+    num_entities = float(np.sum(label_is_entity))
+    entity_preds = float(np.sum(pred_is_entity))
     #how many times did we correctly predict an entity?
-    correct_entitites = np.sum(corr_pred[pred_is_entity])
+    correct_entitites = float(np.sum(corr_pred[pred_is_entity]))

     #precision: when we predict entity, how often are we right?
     if entity_preds == 0:
         precision = np.nan
     else:
-        precision = correct_entitites/entity_preds
+        precision = correct_entitites / entity_preds

     #recall: of the things that were an entity, how many did we catch?
     recall = correct_entitites / num_entities
@@ -64,6 +65,8 @@ def classifer_metrics(label, pred):
         f1 = 0
     else:
         f1 = 2 * precision * recall / (precision + recall)
+
+    logging.debug("Metrics results: precision=%f recall=%f f1=%f", precision, recall, f1)
     return precision, recall, f1

 def entity_precision(label, pred):
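The float casts above are not cosmetic: `np.sum` over a boolean mask returns a numpy integer, and under Python 2 (which this example still targeted in 2019) dividing two integer counts floors the result, silently zeroing precision and recall. A minimal sketch of the pitfall, using hypothetical masks rather than the example's real data:

import numpy as np

pred_is_entity = np.array([True, False, True, True])   # hypothetical predictions
corr_pred      = np.array([True, True,  False, True])  # hypothetical correctness mask

correct = np.sum(corr_pred[pred_is_entity])  # numpy integer 2
total   = np.sum(pred_is_entity)             # numpy integer 3

# Python 2: 2 / 3 floors to 0; Python 3: true division gives 0.666...
# Casting to float yields the intended ratio on both.
precision = float(correct) / float(total)
print(precision)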
8 changes: 8 additions & 0 deletions example/named_entity_recognition/src/ner.py
@@ -93,6 +93,7 @@ def build_vocab(nested_list):
     """
     # Build vocabulary
     word_counts = Counter(itertools.chain(*nested_list))
+    logging.info("build_vocab: word_counts=%d", len(word_counts))

     # Mapping from index to label
     vocabulary_inv = [x[0] for x in word_counts.most_common()]
@@ -114,6 +115,7 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None):
     :param buckets: size of each bucket in the iterators
     :return: train_iter, val_iter, word_to_index, index_to_word, pos_to_index, index_to_pos
     """
+
     # Read in data as numpy array
     df = pd.read_pickle(os.path.join(data_dir, "ner_data.pkl"))[:max_records]

@@ -135,12 +137,14 @@ def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None):

     # Split into training and testing data
     idx=int(len(indexed_tokens)*train_fraction)
+    logging.info("Preparing train/test datasets splitting at idx %d on total %d sentences using a batchsize of %d", idx, len(indexed_tokens), batch_size)
     X_token_train, X_char_train, Y_train = indexed_tokens[:idx], indexed_chars[:idx], indexed_entities[:idx]
     X_token_test, X_char_test, Y_test = indexed_tokens[idx:], indexed_chars[idx:], indexed_entities[idx:]

     # build iterators to feed batches to network
     train_iter = iterators.BucketNerIter(sentences=X_token_train, characters=X_char_train, label=Y_train,
                                          max_token_chars=5, batch_size=batch_size, buckets=buckets)
+    logging.info("Creating the val_iter using %d sentences", len(X_token_test))
     val_iter = iterators.BucketNerIter(sentences=X_token_test, characters=X_char_test, label=Y_test,
                                        max_token_chars=train_iter.max_token_chars, batch_size=batch_size, buckets=train_iter.buckets)
     return train_iter, val_iter, word_to_index, char_to_index, entity_to_index
@@ -205,6 +209,8 @@ def sym_gen(seq_len):
 def train(train_iter, val_iter):
     import metrics
     devs = mx.cpu() if args.gpus is None or args.gpus is '' else [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    logging.info("train on device %s using optimizer %s at learningrate %f for %d epochs using %d records: lstm_state_size=%d ...",
+                 devs, args.optimizer, args.lr, args.num_epochs, args.max_records, args.lstm_state_size)
     module = mx.mod.BucketingModule(sym_gen, train_iter.default_bucket_key, context=devs)
     module.fit(train_data=train_iter,
                eval_data=val_iter,
@@ -225,6 +231,8 @@ def train(train_iter, val_iter):
 train_iter, val_iter, word_to_index, char_to_index, entity_to_index = build_iters(args.data_dir, args.max_records,
                                                                                   args.train_fraction, args.batch_size, args.buckets)

+logging.info("validation iterator: %s", val_iter)
+
 # Define the recurrent layer
 bi_cell = mx.rnn.SequentialRNNCell()
 for layer_num in range(args.lstm_layers):
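Note that the new calls pass their format arguments lazily (`logging.info("... %d", x)` rather than `"..." % x`), so the message string is only built if the record is actually emitted. They are also invisible under the root logger's default WARNING level; a minimal, hypothetical setup (not part of this commit) to surface them when running the script:

import logging

# DEBUG also surfaces the "Metrics results: ..." line added in metrics.py;
# use logging.INFO to see only the dataset/training messages.
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(levelname)s %(message)s")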
7 changes: 7 additions & 0 deletions example/named_entity_recognition/src/preprocess.py
@@ -19,6 +19,7 @@

 # -*- coding: utf-8 -*-

+import logging
 import pandas as pd
 import numpy as np

@@ -45,6 +46,12 @@

 #join the results on utterance id
 df = df1.merge(df2.merge(df3, how = "left", on = "utterance_id"), how = "left", on = "utterance_id")
+# set the display option directly: pd.option_context has no effect as a bare call
+pd.set_option('display.max_colwidth', None)
+
+logging.info("preprocess: 1st sentence:")
+logging.info(df['token'].iloc[0].tolist())
+logging.info(df['BILOU_tag'].iloc[0].tolist())

 #save the dataframe to a csv file
 df.to_pickle("../data/ner_data.pkl")

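For reference, `pd.option_context` is a context manager meant to widen the display temporarily inside a `with` block (and `'display.max_rowwidth'` is not a pandas option at all), which is why the bare calls above were replaced with `pd.set_option`. A minimal, hypothetical sketch of the intended usage, assuming pandas >= 1.0 where `None` means no truncation:

import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)
df = pd.DataFrame({"token": [["EU", "rejects", "German", "call"]]})  # hypothetical row

# the display option is widened only inside the block and restored on exit
with pd.option_context('display.max_colwidth', None):
    logging.info("first row, untruncated:\n%s", df.head(1))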