# general_utils.py

import pickle
import sys
import time

import numpy as np
from nltk import Tree


def to_nltk_tree(node, tokens):
    """
    Recursively convert a dependency node into an nltk ``Tree``.

    Args:
        node: object exposing ``word``, ``left_children`` and
            ``right_children``; the children are used as indices into
            ``tokens``.
        tokens: indexable collection mapping a child index to its node.
    Returns:
        ``node.word`` itself when the node has no children, otherwise a
        ``Tree`` labelled with ``node.word`` whose subtrees are the converted
        left children followed by the converted right children.
    """
    child_count = len(node.left_children) + len(node.right_children)
    if child_count == 0:
        # Leaf: nltk represents leaves as the bare label.
        return node.word

    subtrees = []
    for child in node.left_children + node.right_children:
        subtrees.append(to_nltk_tree(tokens[child], tokens))
    return Tree(node.word, subtrees)

def compute_dependencies(data, dataset):
    """
    Predict transitions for every sentence in `data`, batch by batch, and
    apply them to the sentence objects in place.

    For each batch the loop repeatedly extracts features for the sentences
    that are still active, scores the transitions with `forward`, picks the
    best legal transition per sentence and applies it, until every sentence
    in the batch has exactly one item on its stack and an empty buffer.

    Args:
        data: iterable of sentence objects. The code calls
            `clear_prediction_dependencies`, `clear_children_info`,
            `get_legal_labels`, `update_child_dependencies`,
            `update_state_by_transition` and `reset_to_initial_state` on
            them and reads their `stack` and `buff` attributes.
        dataset: object providing `model_config.batch_size`,
            `feature_extractor`, `word2idx`, `pos2idx` and `dep2idx`.
    Returns:
        None. All results are stored on the sentence objects.

    NOTE(review): `forward` is not defined or imported in the visible part
    of this module, so calling this function as shown raises NameError
    unless the name is supplied elsewhere - confirm.
    """
    sentences = data
    # Shallow copy: batches are sliced off this list as they are processed.
    rem_sentences = [sentence for sentence in sentences]
    # Comprehensions here (and below) are used purely for their side effects.
    [sentence.clear_prediction_dependencies() for sentence in sentences]
    [sentence.clear_children_info() for sentence in sentences]

    while len(rem_sentences) != 0:
        # The last batch may be smaller than the configured batch size.
        curr_batch_size = min(dataset.model_config.batch_size,
                              len(rem_sentences))
        batch_sentences = rem_sentences[:curr_batch_size]

        # 1 = sentence still needs transitions, 0 = finished (one item left on
        # the stack and nothing in the buffer).
        enable_features = [
            0 if len(sentence.stack) == 1 and len(sentence.buff) == 0 else 1
            for sentence in batch_sentences
        ]
        enable_count = np.count_nonzero(enable_features)

        while enable_count > 0:
            # Only the unfinished sentences take part in this step.
            curr_sentences = [
                sentence for i, sentence in enumerate(batch_sentences)
                if enable_features[i] == 1
            ]

            # get feature for each sentence
            # call predictions -> argmax
            # store dependency and left/right child
            # update state
            # repeat

            curr_inputs = [
                dataset.feature_extractor.extract_for_current_state(
                    sentence, dataset.word2idx, dataset.pos2idx,
                    dataset.dep2idx) for sentence in curr_sentences
            ]
            # Split the per-sentence results into three parallel lists
            # (elements 0, 1 and 2 of each extracted result).
            word_inputs_batch = [
                curr_inputs[i][0] for i in range(len(curr_inputs))
            ]
            pos_inputs_batch = [
                curr_inputs[i][1] for i in range(len(curr_inputs))
            ]
            dep_inputs_batch = [
                curr_inputs[i][2] for i in range(len(curr_inputs))
            ]

            # NOTE(review): the trailing commas below make each of these a
            # 1-tuple wrapping the batch list, not the list itself. This looks
            # like a leftover from a keyword-argument/feed-dict style call -
            # confirm what `forward` expects before changing it.
            word_index = word_inputs_batch,
            pos_index = pos_inputs_batch,
            dep_index = dep_inputs_batch,
            # NOTE(review): `dropout_index` is assigned but never used.
            dropout_index = 1

            # These are the raw outputs, which represent the activations for
            # prediction over valid transitions
            predictions = forward(word_index, pos_index, dep_index)
            # Debug output; `.size()` implies a tensor-like return value -
            # presumably a torch tensor, TODO confirm.
            print('predictions: ', predictions.size())

            legal_labels = np.asarray(
                [sentence.get_legal_labels() for sentence in curr_sentences],
                dtype=np.float32)
            # Adding 1000 * legal_labels before the row-wise argmax biases the
            # choice towards entries flagged in `legal_labels` (assumes the
            # flags are 0/1 and raw scores stay well below 1000 - confirm).
            legal_transitions = np.argmax(
                predictions + 1000 * legal_labels, axis=1)

            # update left/right children so can be used for next feature vector
            # Transition id 2 is skipped here - presumably the transition that
            # creates no dependency arc (e.g. shift); verify against the
            # sentence class.
            [
                sentence.update_child_dependencies(transition)
                for (sentence,
                     transition) in zip(curr_sentences, legal_transitions)
                if transition != 2
            ]

            # update state
            [
                sentence.update_state_by_transition(
                    legal_transition, gold=False)
                for (
                    sentence,
                    legal_transition) in zip(curr_sentences, legal_transitions)
            ]

            # Recompute which sentences of the batch are still unfinished.
            enable_features = [
                0
                if len(sentence.stack) == 1 and len(sentence.buff) == 0 else 1
                for sentence in batch_sentences
            ]
            enable_count = np.count_nonzero(enable_features)

        # Reset stack and buffer
        [sentence.reset_to_initial_state() for sentence in batch_sentences]
        # Drop the processed batch and continue with the remainder.
        rem_sentences = rem_sentences[curr_batch_size:]


def get_minibatches(data,
                    minibatch_size,
                    shuffle=True,
                    is_multi_feature_input=False):
    """
    Iterates through the provided data one minibatch at a time. You can use this function to
    iterate through data in minibatches as follows:
        for inputs_minibatch in get_minibatches(inputs, minibatch_size):
            ...
    Or with multiple data sources:
        for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size):
            ...
    Args:
        data: there are two possible values:
            - a list or numpy array
            - a list where each element is either a list or numpy array
          With is_multi_feature_input=True, data[0] is itself a sequence of
          feature sources (e.g. [word, pos, label]) and data[1] the labels.
        minibatch_size: the maximum number of items in a minibatch
        shuffle: whether to randomize the order of returned data
        is_multi_feature_input: True if multiple type features are present ex. (word, pos, label)
    Returns:
        minibatches: the return value depends on data:
            - If data is a list/array it yields the next minibatch of data.
            - If data a list of lists/arrays it returns the next minibatch of each element in the
              list. This can be used to iterate through multiple data sources
              (e.g., features and labels) at the same time.
            - With is_multi_feature_input=True it yields
              [[minibatch of each feature source], minibatch of labels] when
              data is a list whose first feature source is a list/array,
              otherwise only the list of feature minibatches.
        An empty list yields no minibatches.
    """
    if is_multi_feature_input:
        list_data = isinstance(data, list) and isinstance(
            data[0][0], (list, np.ndarray))
        # Every feature source holds one entry per example.
        data_size = len(data[0][0])
    else:
        list_data = (isinstance(data, list) and len(data) > 0
                     and isinstance(data[0], (list, np.ndarray)))
        # A single source is measured directly; with several parallel
        # sources the first one gives the number of examples.
        data_size = len(data[0]) if list_data else len(data)

    indices = np.arange(data_size)
    if shuffle:
        np.random.shuffle(indices)

    for minibatch_start in range(0, data_size, minibatch_size):
        minibatch_indices = indices[minibatch_start:
                                    minibatch_start + minibatch_size]
        if is_multi_feature_input:
            feature_batches = [
                minibatch(feature, minibatch_indices) for feature in data[0]
            ]
            if list_data:
                yield [feature_batches, minibatch(data[1], minibatch_indices)]
            else:
                yield feature_batches
        elif list_data:
            yield [minibatch(d, minibatch_indices) for d in data]
        else:
            yield minibatch(data, minibatch_indices)


def minibatch(data, minibatch_idx):
    """
    Select the entries of `data` at the positions in `minibatch_idx`.

    Numpy arrays are indexed in one step and return an array; any other
    indexable sequence is gathered element by element into a new list.
    """
    if isinstance(data, np.ndarray):
        return data[minibatch_idx]
    return [data[i] for i in minibatch_idx]


def test_all_close(name, actual, expected):
    """
    Check that two arrays have the same shape and agree element-wise.

    Args:
        name: label used in the printed/raised messages.
        actual: numpy array produced by the code under test.
        expected: numpy array holding the reference values.
    Raises:
        ValueError: if the shapes differ, or if any pair of elements differs
            by more than 1e-6 in absolute value. A NaN on one side only is
            treated as a mismatch; NaNs in the same position on both sides
            are treated as equal.
    Prints "<name> passed!" on success.
    """
    if actual.shape != expected.shape:
        raise ValueError(
            "{:} failed, expected output to have shape {:} but has shape {:}"
            .format(name, expected.shape, actual.shape))
    # A max-of-abs-difference comparison is False whenever the difference
    # contains NaN, which would let NaN outputs pass; allclose rejects them.
    if not np.allclose(
            actual, expected, rtol=0.0, atol=1e-6, equal_nan=True):
        raise ValueError("{:} failed, expected {:} but value is {:}".format(
            name, expected, actual))
    print(name, "passed!")


# The "test_" prefix would make pytest try to collect this helper as a test
# case (and fail on its parameters) in any test module that imports it.
test_all_close.__test__ = False


def get_pickle(path):
    """
    Load and return the object pickled in the file at `path`.

    The file is opened in binary mode and closed again before returning,
    including when unpickling raises.

    NOTE: unpickling can execute arbitrary code; only load files that come
    from a trusted source.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def dump_pickle(data, path):
    """
    Serialize `data` with pickle (default protocol) into the file at `path`.

    The file is opened in binary write mode, so an existing file is
    overwritten, and it is closed when the function returns.
    """
    with open(path, "wb") as out_file:
        pickle.Pickler(out_file).dump(data)


def get_vocab_dict(items):
    """
    Map each item to its position in `items`.

    Positions start at 0 and advance for every element, so when an item
    occurs more than once it ends up mapped to its last position.
    """
    return {item: position for position, item in enumerate(items)}


def logged_loop(iterable, n=None):
    """
    Yield the elements of `iterable` while drawing a progress bar.

    Args:
        iterable: the items to iterate over.
        n: total number of items; defaults to ``len(iterable)`` and must be
            given explicitly for iterables without a length.
    Yields:
        The elements of `iterable`, unchanged and in order.

    The bar is refreshed roughly 1000 times over the whole loop and once more
    on the final element.
    """
    if n is None:
        n = len(iterable)
    # Floor division keeps `step` integral; with true division a fractional
    # step makes `i % step == 0` almost never true, so the bar would stall.
    step = max(1, n // 1000)
    prog = Progbar(n)
    for i, elem in enumerate(iterable):
        if i % step == 0 or i == n - 1:
            prog.update(i + 1)
        yield elem


class Progbar(object):
    """
    Progbar class copied from keras (https://github.com/fchollet/keras/)
    Displays a progress bar.
    # Arguments
        target: Total number of steps expected.
        width: Number of characters between the brackets of the bar.
        verbose: 1 redraws the bar on every update, 2 prints a single
            summary line once `target` is reached, any other value only
            keeps the running statistics without printing.
    """

    def __init__(self, target, width=30, verbose=1):
        self.width = width
        self.target = target
        # name -> [weighted sum, number of steps]; averaged when displayed.
        self.sum_values = {}
        # Names in first-seen order, used as the display order.
        self.unique_values = []
        self.start = time.time()
        # Length of the previously drawn line, needed to erase it.
        self.total_width = 0
        self.seen_so_far = 0
        self.verbose = verbose

    def update(self, current, values=None, exact=None):
        """
        Updates the progress bar.
        # Arguments
            current: Index of current step.
            values: List of tuples (name, value_for_last_step).
                The progress bar will display averages for these values.
            exact: List of tuples (name, value_for_last_step).
                The progress bar will display these values directly.
        """
        values = [] if values is None else values
        exact = [] if exact is None else exact

        # Each value is weighted by the number of steps since the last update.
        steps = current - self.seen_so_far
        for k, v in values:
            if k not in self.sum_values:
                self.sum_values[k] = [v * steps, steps]
                self.unique_values.append(k)
            else:
                self.sum_values[k][0] += v * steps
                self.sum_values[k][1] += steps
        for k, v in exact:
            if k not in self.sum_values:
                self.unique_values.append(k)
            # Stored with a count of 1 so the displayed average is v itself.
            self.sum_values[k] = [v, 1]
        self.seen_so_far = current

        now = time.time()
        if self.verbose == 1:
            prev_total_width = self.total_width
            # Move the cursor back to the start of the previously drawn line.
            sys.stdout.write("\b" * prev_total_width)
            sys.stdout.write("\r")

            # log10 is undefined for a non-positive target (e.g. an empty
            # loop); fall back to a single digit instead of crashing.
            if self.target > 0:
                numdigits = int(np.floor(np.log10(self.target))) + 1
            else:
                numdigits = 1
            barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
            bar = barstr % (current, self.target)
            # A zero target has nothing left to do: draw it as complete
            # rather than dividing by zero.
            prog = float(current) / self.target if self.target else 1.0
            prog_width = int(self.width * prog)
            if prog_width > 0:
                bar += ('=' * (prog_width - 1))
                if current < self.target:
                    bar += '>'
                else:
                    bar += '='
            bar += ('.' * (self.width - prog_width))
            bar += ']'
            sys.stdout.write(bar)
            self.total_width = len(bar)

            if current:
                time_per_unit = (now - self.start) / current
            else:
                time_per_unit = 0
            eta = time_per_unit * (self.target - current)
            info = ''
            if current < self.target:
                info += ' - ETA: %ds' % eta
            else:
                info += ' - %ds' % (now - self.start)
            for k in self.unique_values:
                if isinstance(self.sum_values[k], list):
                    info += ' - %s: %.4f' % (
                        k,
                        self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                else:
                    info += ' - %s: %s' % (k, self.sum_values[k])

            self.total_width += len(info)
            # Pad with spaces so leftovers of a longer previous line vanish.
            if prev_total_width > self.total_width:
                info += ((prev_total_width - self.total_width) * " ")

            sys.stdout.write(info)
            sys.stdout.flush()

            if current >= self.target:
                sys.stdout.write("\n")

        if self.verbose == 2:
            if current >= self.target:
                info = '%ds' % (now - self.start)
                for k in self.unique_values:
                    info += ' - %s: %.4f' % (
                        k,
                        self.sum_values[k][0] / max(1, self.sum_values[k][1]))
                sys.stdout.write(info + "\n")

    def add(self, n, values=None):
        """Advance the bar by `n` steps, forwarding `values` to `update`."""
        self.update(self.seen_so_far + n, values)


def make_embedding_to_pkl(embedding_files=None,
                          output_dir="/home/asjindal/data/embeddings_pkl/"):
    """
    Convert text embedding files into pickled ``{word: vector_string}`` dicts.

    Each input line is expected to hold a word followed by its vector. The
    delimiter is detected from the first line of each file: if that line
    contains a tab, the word and the vector are tab-separated; otherwise
    everything is space-separated. Vectors are stored as the stripped text
    after the word, not as parsed numbers. Blank lines are skipped.

    Args:
        embedding_files: iterable of ``(source_path, pickle_file_name)``
            pairs. Defaults to the SENNA and GloVe files this script was
            written for.
        output_dir: prefix prepended verbatim to each pickle file name, so it
            must end with a path separator.

    One pickle is written per input file and holds only that file's vectors.
    """
    if embedding_files is None:
        embedding_files = [
            ("/home/asjindal/Work/Retraining/en-senna-50.txt",
             "senna.50d_dict.pkl"),
            ("/home/asjindal/data/glove/glove.6B.50d.txt",
             "glove.6B.50d_dict.pkl"),
            ("/home/asjindal/data/glove/glove.6B.100d.txt",
             "glove.6B.100d_dict.pkl"),
            ("/home/asjindal/Downloads/glove.42B.300d.txt",
             "glove.42B.300d_dict.pkl"),
        ]

    for source_path, pickle_name in embedding_files:
        # Start from an empty dict for every file; a shared dict would leak
        # vectors of other dimensionalities into the later pickles.
        word_vectors = {}
        delim = None
        # Stream the file line by line: the larger embedding files are too
        # big to hold comfortably in memory as a list of lines.
        with open(source_path, "r") as source:
            for line in source:
                line = line.strip()
                if delim is None:
                    # Delimiter is decided once, from the first line.
                    delim = "\t" if "\t" in line else " "
                if not line:
                    continue
                sp = line.split(delim)
                if delim == " ":
                    word_vectors[sp[0]] = " ".join(sp[1:]).strip()
                else:
                    word_vectors[sp[0]] = sp[1].strip()

        print("Loaded!")
        dump_pickle(word_vectors, output_dir + pickle_name)
        print("Done!")