# generate_dataset.py

# -*- coding: utf-8 -*-
"""pubrec-generate-dataset.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1yowkK19M7YGLZTbjX7g4dTEaQphvp0a_
"""

# Mount Google Drive inside the Colab runtime at /gdrive.
# Commented out IPython magic to ensure Python compatibility.
from google.colab import drive
drive.mount('/gdrive')
# %cd /gdrive

# Commented out IPython magic to ensure Python compatibility.
# %cd My\ Drive/pubrec
# NOTE(review): with both `%cd` magics disabled the working directory is never
# changed, yet the rest of the script uses relative paths ('./data/...',
# './*_train.p', ...). Presumably those files live under
# /gdrive/My Drive/pubrec -- run the script from there; TODO confirm.

import torch
import torch.utils.data as torch_data

import time
import csv
import json as js
import os
import codecs
import pickle as p

import numpy as np

import nltk
# Fetch the 'punkt' tokenizer data at import time; presumably required by the
# nltk.sent_tokenize / nltk.word_tokenize calls in the make_*_data functions.
nltk.download('punkt')

# Integer ids of the special tokens. PAD and UNK match the positions that
# makeVocabulary() gives PAD_WORD and UNK_WORD (they are registered first, in
# that order). BOS and EOS are not referenced by the visible code.
PAD = 0
UNK = 1
BOS = 2
EOS = 3

# Surface forms of the special tokens.
PAD_WORD = '<blank>' 
UNK_WORD = 'UNK'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
# Extra special entry registered by makeVocabulary() when char=True.
SPA_WORD = ' '

def flatten(l):
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Any element exposing ``__iter__`` is descended into; everything else is
    yielded as-is. Strings are treated as leaves: in Python 3 a ``str`` is
    iterable and a one-character string iterates to itself, so descending
    into it would recurse forever.

    Args:
        l: an iterable whose elements may themselves be iterables.

    Yields:
        The non-iterable (or string) leaves, in depth-first order.
    """
    for el in l:
        if hasattr(el, "__iter__") and not isinstance(el, str):
            yield from flatten(el)
        else:
            yield el

class Dict(object):
    """Bidirectional mapping between labels (tokens) and integer indices.

    Besides the two lookup tables, the dictionary counts how often each index
    was passed through `add` and remembers a list of "special" indices that
    `prune` always carries over into the pruned dictionary.
    """

    def __init__(self, data=None, lower=False):
        """Create a dictionary, optionally pre-populated from `data`.

        Args:
            data: ``None`` for an empty dictionary, a ``str`` path of a
                vocabulary file to load (see `loadFile`), or an iterable of
                labels that are registered as special entries.
            lower: if true, labels are lower-cased on insertion and lookup.
        """
        self.idxToLabel = {}   # index -> label
        self.labelToIdx = {}   # label -> index
        self.frequencies = {}  # index -> number of times `add` resolved to it
        self.lower = lower
        # Special entries will not be pruned.
        self.special = []

        if data is not None:
            if isinstance(data, str):
                self.loadFile(data)
            else:
                self.addSpecials(data)

    def size(self):
        """Return the number of entries in the dictionary."""
        return len(self.idxToLabel)

    # Load entries from a file.
    def loadFile(self, filename):
        """Add the entries stored in `filename`.

        Each line holds ``<label> <index>``; the last whitespace-separated
        field is the index and everything before it (re-joined with single
        spaces) is the label. Blank lines are skipped.
        """
        with open(filename) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank / whitespace-only line: there is no index to parse.
                    continue
                label = ' '.join(fields[:-1])
                idx = int(fields[-1])
                self.add(label, idx)

    # Write entries to a file.
    def writeFile(self, filename):
        """Write every entry to `filename` as one ``<label> <index>`` line.

        Indices ``0 .. size()-1`` are written in order, so the index space is
        expected to be contiguous (a gap raises ``KeyError``).
        """
        # The `with` block closes the file; no explicit close() is needed.
        with open(filename, 'w') as file:
            for i in range(self.size()):
                label = self.idxToLabel[i]
                file.write('%s %d\n' % (label, i))

    def loadDict(self, idxToLabel):
        """Add the entries of `idxToLabel`, read at keys ``0 .. len-1``."""
        for i in range(len(idxToLabel)):
            label = idxToLabel[i]
            self.add(label, i)

    def lookup(self, key, default=None):
        """Return the index of label `key`, or `default` if it is unknown."""
        key = key.lower() if self.lower else key
        return self.labelToIdx.get(key, default)

    def getLabel(self, idx, default=None):
        """Return the label stored at `idx`, or `default` if there is none."""
        return self.idxToLabel.get(idx, default)

    # Mark this `label` and `idx` as special (i.e. will not be pruned).
    def addSpecial(self, label, idx=None):
        """Add `label` (see `add`) and record its index as special."""
        idx = self.add(label, idx)
        self.special.append(idx)

    # Mark all labels in `labels` as specials (i.e. will not be pruned).
    def addSpecials(self, labels):
        """Register every label in `labels` as a special entry."""
        for label in labels:
            self.addSpecial(label)

    # Add `label` in the dictionary. Use `idx` as its index if given.
    def add(self, label, idx=None):
        """Insert `label` and bump the frequency of its index.

        Args:
            label: the label to add (lower-cased first when `self.lower`).
            idx: explicit index to bind the label to. When omitted, an
                existing label keeps its index and a new label receives the
                next free one (the current number of entries).

        Returns:
            The index the label is stored under.
        """
        label = label.lower() if self.lower else label
        if idx is not None:
            self.idxToLabel[idx] = label
            self.labelToIdx[label] = idx
        else:
            if label in self.labelToIdx:
                idx = self.labelToIdx[label]
            else:
                idx = len(self.idxToLabel)
                self.idxToLabel[idx] = label
                self.labelToIdx[label] = idx

        if idx not in self.frequencies:
            self.frequencies[idx] = 1
        else:
            self.frequencies[idx] += 1

        return idx

    # Return a new dictionary with the `size` most frequent entries.
    def prune(self, size):
        """Return a dictionary restricted to the `size` most frequent entries.

        If `size` is not smaller than the current size, `self` is returned
        unchanged. Otherwise a new `Dict` is built: the special entries are
        added first (so they are kept regardless of frequency, and the result
        may hold more than `size` entries), followed by the `size` most
        frequent labels. Indices and frequencies are reassigned in the new
        dictionary.
        """
        if size >= self.size():
            return self

        # Only keep the `size` most frequent entries.
        freq = torch.Tensor(
                [self.frequencies[i] for i in range(len(self.frequencies))])
        _, idx = torch.sort(freq, 0, True)
        newDict = Dict()
        newDict.lower = self.lower

        # Add special entries in all cases.
        for i in self.special:
            newDict.addSpecial(self.idxToLabel[i])

        for i in idx[:size]:
            newDict.add(self.idxToLabel[i.item()])

        return newDict

    # Convert `labels` to indices. Use `unkWord` if not found.
    # Optionally insert `bosWord` at the beginning and `eosWord` at the end.
    def convertToIdx(self, labels, unkWord, bosWord=None, eosWord=None):
        """Map `labels` to a ``torch.LongTensor`` of indices.

        Labels missing from the dictionary map to the index of `unkWord`.
        The indices of `bosWord` / `eosWord` are prepended / appended when
        those arguments are given.
        """
        vec = []

        if bosWord is not None:
            vec += [self.lookup(bosWord)]

        unk = self.lookup(unkWord)
        vec += [self.lookup(label, default=unk) for label in labels]

        if eosWord is not None:
            vec += [self.lookup(eosWord)]

        vec = list(flatten(vec))

        return torch.LongTensor(vec)

    # Convert `idx` to labels. Stop as soon as index `stop` is reached; the
    # `stop` index itself is NOT converted.
    def convertToLabels(self, idx, stop):
        """Return the labels for the indices in `idx` that precede `stop`."""
        labels = []

        for i in idx:
            if i == stop:
                break
            labels.append(self.getLabel(i))

        return labels

class AttrDict(dict):
    """A ``dict`` whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        # Populate the mapping exactly as a plain dict would...
        super().__init__(*args, **kwargs)
        # ...then make the instance namespace and the mapping one and the
        # same object, so `obj.key` and `obj['key']` share storage.
        self.__dict__ = self

def read_config(path):
    """Load the JSON file at `path` and return its content as an `AttrDict`.

    The file is opened in a `with` block so that the handle is closed as soon
    as parsing finishes (or fails).
    """
    with open(path, 'r') as config_file:
        return AttrDict(js.load(config_file))


def format_time(t):
    """Render the time tuple / ``struct_time`` `t` as ``YYYY-MM-DD-HH:MM:SS``."""
    pattern = "%Y-%m-%d-%H:%M:%S"
    return time.strftime(pattern, t)


def logging(file):
    """Return a function that echoes text to stdout and appends it to `file`.

    The returned callable takes one string `s`. It prints `s` without adding
    a newline and then appends the same text to `file`, which is reopened in
    append mode on every call.
    """
    def write_log(s):
        # Echo first; the caller is responsible for any trailing newline.
        print(s, end='')
        with open(file, 'a') as log_handle:
            log_handle.write(s)

    return write_log


def logging_csv(file):
    """Return a function that appends one CSV row per call to `file`.

    The returned callable takes an iterable of fields `s` and writes it as a
    single row with ``csv.writer``. The file is reopened in append mode (with
    ``newline=''``, as the csv module expects) on every call.
    """
    def write_csv(s):
        with open(file, 'a', newline='') as csv_handle:
            csv.writer(csv_handle).writerow(s)

    return write_csv

class dataset(torch_data.Dataset):
    """Torch dataset pairing several text feature fields with one label array.

    `text_data` is a sequence of fields, each indexable by sample number and
    yielding a numpy array; `label_data` is indexable the same way. The
    dataset length is the number of labels.
    """

    def __init__(self, text_data, label_data):
        self.label_data = label_data
        self.text_data = text_data

    def __len__(self):
        return len(self.label_data)

    def __getitem__(self, index):
        """Return ``(features, label)`` for sample `index` as float tensors.

        `features` is a list with one tensor per field of `text_data`.
        """
        features = []
        for field in self.text_data:
            tensor = torch.from_numpy(field[index])
            features.append(tensor.type(torch.FloatTensor))
        label = torch.from_numpy(self.label_data[index]).type(torch.FloatTensor)
        return features, label
       

def get_loader(dataset, batch_size, shuffle, num_workers):
    """Wrap `dataset` in a ``torch.utils.data.DataLoader``.

    `batch_size`, `shuffle` and `num_workers` are forwarded unchanged to the
    loader; all other loader options keep their defaults.
    """
    loader_options = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'num_workers': num_workers,
    }
    return torch.utils.data.DataLoader(dataset, **loader_options)

def makeVocabulary(filename, size, sep=' ', char=False):
    """Build a lower-casing vocabulary from one or more pickled corpora.

    Args:
        filename: path of a pickle file, or a list/tuple of such paths. Each
            file must unpickle to an iterable of strings.
        size: number of most frequent entries to keep (see `Dict.prune`).
        sep: separator used to split each string into words.
        char: if true, the vocabulary is built over the characters of each
            word (and `SPA_WORD` is registered as an extra special entry);
            otherwise over whole words.

    Returns:
        The pruned `Dict`, with `PAD_WORD` and `UNK_WORD` as special entries.
    """
    vocab = Dict([PAD_WORD, UNK_WORD], lower=True)
    if char:
        vocab.addSpecial(SPA_WORD)

    # Treat a single path as a one-element list so both cases share one loop.
    filenames = filename if isinstance(filename, (list, tuple)) else [filename]

    lengths = []
    for _filename in filenames:
        # NOTE: pickle.load can execute arbitrary code while unpickling; only
        # point this at files from a trusted source.
        with open(_filename, "rb") as data_file:
            data = p.load(data_file)
        for sent in data:
            for word in sent.strip().split(sep):
                lengths.append(len(word))
                if char:
                    for ch in word.strip():
                        vocab.add(ch)
                else:
                    vocab.add(word.strip())

    # Word-length statistics; skipped for empty input, where max()/min() and
    # the average would otherwise raise.
    if lengths:
        print('max: %d, min: %d, avg: %.2f' % (max(lengths), min(lengths), sum(lengths)/len(lengths)))

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab

def initVocabulary(name, dataFile, vocabFile, vocabSize, sep=' ', char=False):
    """Return the `name` vocabulary, loading it from disk when possible.

    When `vocabFile` is given, the dictionary is read from that file and
    `dataFile`, `vocabSize`, `sep` and `char` are not used. Otherwise a new
    vocabulary is generated from `dataFile` with `makeVocabulary`.
    """
    if vocabFile is None:
        # No saved dictionary available: generate one from the data.
        print('Building ' + name + ' vocabulary...')
        return makeVocabulary(dataFile, vocabSize, sep=sep, char=char)

    # Load the existing word dictionary.
    print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
    loaded = Dict()
    loaded.loadFile(vocabFile)
    print('Loaded ' + str(loaded.size()) + ' ' + name + ' words')
    return loaded


def saveVocabulary(name, vocab, file):
    """Announce the save on stdout, then write `vocab` to `file`.

    `vocab` must provide a ``writeFile(path)`` method; `name` is only used in
    the printed message.
    """
    announcement = 'Saving ' + name + ' vocabulary to \'' + file + '\'...'
    print(announcement)
    vocab.writeFile(file)

# Load the previously saved vocabularies. A vocabFile is passed, so
# initVocabulary reads it from disk; the data file (None) and the size
# argument are not used on that path.
dicts = {}
dicts['text'] = initVocabulary('text', None, './data/text_dict', 50000, ' ', False)
dicts['authors'] = initVocabulary('authors', None, './data/authors_dict', 20000, ' ', False)

# Disabled: writing the text/authors vocabularies back to disk.
# saveVocabulary('text', dicts['text'], './data/text_dict')
# saveVocabulary('authors', dicts['authors'], './data/authors_dict')

# Disabled: presumably the one-off code that built './data/pvs_dict' from the
# distinct values in pv_train.p (pruned to 300 entries) -- TODO confirm.
# pvs=p.load(open("./pv_train.p","rb"))
# unique_pvs=np.unique(np.array(pvs))
# dicts['pvs']=Dict()
# for pv in unique_pvs:
#   dicts['pvs'].add(pv)
# dicts['pvs'] = dicts['pvs'].prune(300)
# saveVocabulary('pvs', dicts['pvs'], './data/pvs_dict')

dicts['pvs'] = initVocabulary('pvs', None, './data/pvs_dict', 50000, ' ', False)

# Per-split input specifications consumed by make_data():
#   text_file / pv_file : pickle file holding the raw samples
#   text_dict / pv_dict : vocabulary used to turn tokens into indices
#   doc_len             : number of sentences kept per abstract
#   text_len            : number of tokens kept per sentence / title / author list
# Training split.
abstract = {'text_file': './abstract_train.p', 'text_dict': dicts['text'], 'doc_len': 8, 'text_len': 20}
title = {'text_file': './title_train.p', 'text_dict': dicts['text'], 'text_len': 20}
authors = {'text_file': './authors_train.p', 'text_dict': dicts['authors'], 'text_len': 7}
pv = {'pv_file': './pv_train.p', 'pv_dict': dicts['pvs']}

# Validation split.
abstract_val = {'text_file': './abstract_val.p', 'text_dict': dicts['text'], 'doc_len': 8, 'text_len': 20}
title_val = {'text_file': './title_val.p', 'text_dict': dicts['text'], 'text_len': 20}
authors_val = {'text_file': './authors_val.p', 'text_dict': dicts['authors'], 'text_len': 7}
pv_val = {'pv_file': './pv_val.p', 'pv_dict': dicts['pvs']}

# Test split.
abstract_test = {'text_file': './abstract_test.p', 'text_dict': dicts['text'], 'doc_len': 8, 'text_len': 20}
title_test = {'text_file': './title_test.p', 'text_dict': dicts['text'], 'text_len': 20}
authors_test = {'text_file': './authors_test.p', 'text_dict': dicts['authors'], 'text_len': 7}
pv_test = {'pv_file': './pv_test.p', 'pv_dict': dicts['pvs']}

def make_data(abstract, title, authors, pv):
    """Build a `dataset` from the four input specifications of one split.

    Each argument is a dict: `abstract` needs 'text_file', 'text_dict',
    'doc_len' and 'text_len'; `title` and `authors` need 'text_file',
    'text_dict' and 'text_len'; `pv` needs 'pv_file' and 'pv_dict'. Author
    strings are split on ';'.

    Returns:
        A `dataset` whose features are [abstracts, titles, authors] and whose
        labels are the one-hot arrays from `make_pv_data`.
    """
    abstract_arrays = make_abstract_data(
        abstract['text_file'], abstract['text_dict'],
        abstract['doc_len'], abstract['text_len'])
    title_arrays = make_title_data(
        title['text_file'], title['text_dict'], title['text_len'])
    author_arrays = make_author_data(
        authors['text_file'], authors['text_dict'], authors['text_len'],
        sep=';')
    label_arrays = make_pv_data(pv['pv_file'], pv['pv_dict'])

    return dataset([abstract_arrays, title_arrays, author_arrays], label_arrays)

def make_abstract_data(text_file, text_dict, doc_length, text_length, sep=' '):
    """Encode pickled abstracts as fixed-size index matrices.

    Every abstract is split into sentences and each sentence into tokens with
    NLTK. Lower-cased tokens are mapped to indices through `text_dict`.
    Sentences beyond `doc_length` and tokens beyond `text_length` are dropped;
    unused cells stay 0.

    Args:
        text_file: path of a pickle file holding an iterable of strings.
        text_dict: vocabulary object providing ``lookup(key, default)``.
        doc_length: number of sentence rows per abstract.
        text_length: number of token columns per sentence.
        sep: unused; kept for signature compatibility with the other
            make_*_data functions.

    Returns:
        A list with one ``(doc_length, text_length)`` numpy array per abstract.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # files from a trusted source.
    with open(text_file, "rb") as handle:
        data = p.load(handle)

    result = []
    for line in data:
        temp = np.zeros((doc_length, text_length))
        sents = nltk.sent_tokenize(line)
        for i, sent in enumerate(sents[:doc_length]):
            words = nltk.word_tokenize(sent.strip())
            for j, word in enumerate(words[:text_length]):
                # Index 1 is the fallback for tokens missing from the dictionary.
                temp[i, j] = text_dict.lookup(word.lower(), 1)
        result.append(temp)
    return result

def make_title_data(text_file, text_dict, text_length, sep=' '):
    """Encode pickled titles as fixed-length index vectors.

    Every title is tokenized with NLTK. Lower-cased tokens are mapped to
    indices through `text_dict`. Tokens beyond `text_length` are dropped;
    unused positions stay 0.

    Args:
        text_file: path of a pickle file holding an iterable of strings.
        text_dict: vocabulary object providing ``lookup(key, default)``.
        text_length: length of every output vector.
        sep: unused; kept for signature compatibility with the other
            make_*_data functions.

    Returns:
        A list with one numpy array of length `text_length` per title.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # files from a trusted source.
    with open(text_file, "rb") as handle:
        data = p.load(handle)

    result = []
    for line in data:
        temp = np.zeros(text_length)
        words = nltk.word_tokenize(line.strip())
        for i, word in enumerate(words[:text_length]):
            # Index 1 is the fallback for tokens missing from the dictionary.
            temp[i] = text_dict.lookup(word.lower(), 1)
        result.append(temp)
    return result

def make_author_data(text_file, text_dict, text_length, sep=' '):
    """Encode pickled author strings as fixed-length index vectors.

    Every string is stripped and split on `sep`. The lower-cased pieces are
    mapped to indices through `text_dict`. Pieces beyond `text_length` are
    dropped; unused positions stay 0.

    Args:
        text_file: path of a pickle file holding an iterable of strings.
        text_dict: vocabulary object providing ``lookup(key, default)``.
        text_length: length of every output vector.
        sep: separator between the entries of one string.

    Returns:
        A list with one numpy array of length `text_length` per input string.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # files from a trusted source.
    with open(text_file, "rb") as handle:
        data = p.load(handle)

    result = []
    for line in data:
        temp = np.zeros(text_length)
        words = line.strip().split(sep)
        for i, word in enumerate(words[:text_length]):
            # Index 1 is the fallback for entries missing from the dictionary.
            temp[i] = text_dict.lookup(word.lower(), 1)
        result.append(temp)
    return result

def make_pv_data(pv_file, pv_dict):
    """Encode pickled labels as one-hot vectors over `pv_dict`.

    Each item is converted with ``str()`` and looked up in `pv_dict`; items
    missing from the dictionary fall back to index 1.

    Args:
        pv_file: path of a pickle file holding an iterable of label values.
        pv_dict: vocabulary object providing ``idxToLabel`` (its length sets
            the vector size) and ``lookup(key, default)``.

    Returns:
        A list with one numpy array of length ``len(pv_dict.idxToLabel)`` per
        item, holding 1 at the looked-up index and 0 elsewhere.
    """
    length = len(pv_dict.idxToLabel)
    # NOTE: pickle.load can execute arbitrary code while unpickling; only load
    # files from a trusted source.
    with open(pv_file, "rb") as handle:
        data = p.load(handle)

    result = []
    for line in data:
        temp = np.zeros(length)
        temp[pv_dict.lookup(str(line), 1)] = 1
        result.append(temp)
    return result

# Encode the three splits with the specifications defined above.
train = make_data(abstract, title, authors, pv)
val = make_data(abstract_val, title_val, authors_val, pv_val)
test = make_data(abstract_test, title_test, authors_test, pv_test)

# Bundle the `dataset` objects and serialize them with torch.save.
data = {'train': train, 'val': val, 'test': test}
torch.save(data, './data/final_data_3')