"""
Script to generate sparse bag-of-word vectors for the Sent140 dataset, to be
used after downloading and preprocessing data from LEAF. See README.
"""
import pickle
import numpy as np
import json
from collections import defaultdict
from progressbar import progressbar
from scipy.sparse import coo_matrix


def generate_bags(user_x, user_y, VOCAB_SIZE, vocab):
    """
    Create sparse bag-of-words vectors and targets from text samples.

    Args:
        - user_x: {list} of LEAF samples (the post text is at index 4)
        - user_y: {list} of integer targets
        - VOCAB_SIZE: {int} size of the bag-of-words vectors
        - vocab: {dict} mapping from word token to index

    Returns:
        - bags: {coo_matrix} of normalized bag-of-words vectors
        - y_numpy: {np.array} of targets
    """
    y_numpy = np.array(user_y, dtype=np.int32)
    bags = np.zeros((len(user_x), VOCAB_SIZE), dtype=np.float32)

    # split posts and map tokens to indexes
    for (i, post) in enumerate(user_x):
        words = post[4].split(' ')
        for word in words:
            if word in vocab:
                bags[i, vocab[word]] += 1

    # discard any posts with no tokens in the top VOCAB_SIZE most frequent
    total_words_in_vocab = np.sum(bags, axis=1)
    bad_samples = np.where(total_words_in_vocab == 0)[0]
    bags = np.delete(bags, bad_samples, axis=0)
    y_numpy = np.delete(y_numpy, bad_samples)
    total_words_in_vocab = np.delete(total_words_in_vocab, bad_samples)

    # normalize each row so its word counts sum to 1
    bags = bags / total_words_in_vocab[:, None]
    return coo_matrix(bags), y_numpy
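

# Illustrative note (a worked example, not from the original script): with
# vocab = {'good': 0, 'bad': 1} and VOCAB_SIZE = 2, a LEAF sample whose text
# field (index 4) reads 'good good movie' yields the normalized row [1.0, 0.0];
# a post containing no in-vocabulary tokens is dropped together with its target.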


def main():
    VOCAB_SIZE = 5000  # size of bag-of-words vector

    train_fname = 'all_data_niid_0_keep_10_train_8.json'  # generated by LEAF
    test_fname = 'all_data_niid_0_keep_10_test_8.json'    # generated by LEAF

    # path to training data file generated by LEAF tool
    with open(train_fname, 'r') as f:
        train = json.load(f)['user_data']
    users = train.keys()

    # path to test data file generated by LEAF tool
    with open(test_fname, 'r') as f:
        test = json.load(f)['user_data']

    # calculate the frequency of each word token in the entire dataset
    print('Calculating word frequencies...')
    words_freqs = defaultdict(int)
    for user in progressbar(users):
        for x in train[user]['x']:
            words = x[4].split(' ')
            for word in words:
                words_freqs[word] += 1

    # order the words by total usage to get the top VOCAB_SIZE tokens
    sorted_word_freqs = sorted(words_freqs.items(), key=lambda kv: kv[1], reverse=True)
    vocab = sorted_word_freqs[:VOCAB_SIZE]
    vocab = dict([(v[0], i) for (i, v) in enumerate(vocab)])
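    # vocab now maps each of the VOCAB_SIZE most frequent tokens to its
    # frequency rank (0 = most frequent); generate_bags uses these ranks as
    # column indexes into the bag-of-words matrix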

    # gen and save training vectors as sparse arrays
    print('Generating training bag-of-words...')
    train_x = []
    train_y = []
    for user in progressbar(users):
        bags, targets = generate_bags(train[user]['x'],
                                      train[user]['y'],
                                      VOCAB_SIZE,
                                      vocab)
        train_x.append(bags)
        train_y.append(targets)
    with open('./datasets/sent140/train_data_sparse_10k.pkl', 'wb') as f:
        pickle.dump((train_x, train_y), f)

    # gen and save test vectors as sparse arrays
    print('Generating testing bag-of-words...')
    test_x = []
    test_y = []
    for user in progressbar(users):
        bags, targets = generate_bags(test[user]['x'],
                                      test[user]['y'],
                                      VOCAB_SIZE,
                                      vocab)
        test_x.append(bags)
        test_y.append(targets)
    with open('./datasets/sent140/test_data_sparse_10k.pkl', 'wb') as f:
        pickle.dump((test_x, test_y), f)


if __name__ == '__main__':
    main()
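

# A minimal sketch of how the pickles written above might be loaded downstream
# (assumed usage, not part of LEAF or a documented pipeline):
#
#   import pickle
#
#   with open('./datasets/sent140/train_data_sparse_10k.pkl', 'rb') as f:
#       train_x, train_y = pickle.load(f)   # per-user lists
#   user0_dense = train_x[0].toarray()      # shape: (num_samples, VOCAB_SIZE)
#   user0_targets = train_y[0]              # shape: (num_samples,)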