data_preprocessing_seq_to_one.R

# download the IMDB dataset
if (!file.exists("data/aclImdb_v1.tar.gz")) {
  download.file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
                "data/aclImdb_v1.tar.gz")
  untar("data/aclImdb_v1.tar.gz", exdir = "data/")
}

# install required packages
list.of.packages <- c("readr", "dplyr", "stringr", "stringi")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages)) install.packages(new.packages)

require("readr")
require("dplyr")
require("stringr")
require("stringi")

negative_train_list <- list.files("data/aclImdb/train/neg/", full.names = T)
positive_train_list <- list.files("data/aclImdb/train/pos/", full.names = T)

negative_test_list <- list.files("data/aclImdb/test/neg/", full.names = T)
positive_test_list <- list.files("data/aclImdb/test/pos/", full.names = T)

file_import <- function(file_list) {
  import <- sapply(file_list, read_file)
  return(import)
}

negative_train_raw <- file_import(negative_train_list)
positive_train_raw <- file_import(positive_train_list)

negative_test_raw <- file_import(negative_test_list)
positive_test_raw <- file_import(positive_test_list)

train_raw <- c(negative_train_raw, positive_train_raw)
test_raw <- c(negative_test_raw, positive_test_raw)

# Pre-process a corpus composed of a vector of sequences Build a dictionnary
# removing too rare words
text_pre_process <- function(corpus, count_threshold = 10, dic = NULL) {
  raw_vec <- corpus
  raw_vec <- stri_enc_toascii(str = raw_vec)
  
  ### perform some preprocessing
  raw_vec <- str_replace_all(string = raw_vec, pattern = "[^[:print:]]", replacement = "")
  raw_vec <- str_to_lower(string = raw_vec)
  raw_vec <- str_replace_all(string = raw_vec, pattern = "_", replacement = " ")
  raw_vec <- str_replace_all(string = raw_vec, pattern = "\\bbr\\b", replacement = "")
  raw_vec <- str_replace_all(string = raw_vec, pattern = "\\s+", replacement = " ")
  raw_vec <- str_trim(string = raw_vec)
  
  ### Split raw sequence vectors into lists of word vectors (one list element per
  ### sequence)
  word_vec_list <- stri_split_boundaries(raw_vec, type = "word", skip_word_none = T, 
    skip_word_number = F, simplify = F)
  
  ### Build vocabulary
  if (is.null(dic)) {
    word_vec_unlist <- unlist(word_vec_list)
    word_vec_table <- sort(table(word_vec_unlist), decreasing = T)
    word_cutoff <- which.max(word_vec_table < count_threshold)
    word_keep <- names(word_vec_table)[1:(word_cutoff - 1)]
    stopwords <- c(letters, "an", "the", "br")
    word_keep <- setdiff(word_keep, stopwords)
  } else word_keep <- names(dic)[!dic == 0]
  
  ### Clean the sentences to keep only the curated list of words
  word_vec_list <- lapply(word_vec_list, function(x) x[x %in% word_keep])
  
  # sentence_vec<- stri_split_boundaries(raw_vec, type='sentence', simplify = T)
  word_vec_length <- lapply(word_vec_list, length) %>% unlist()
  
  ### Build dictionnary
  dic <- 1:length(word_keep)
  names(dic) <- word_keep
  dic <- c(`¤` = 0, dic)
  
  ### reverse dictionnary
  rev_dic <- names(dic)
  names(rev_dic) <- dic
  
  return(list(word_vec_list = word_vec_list, dic = dic, rev_dic = rev_dic))
}

################################################################ 
make_bucket_data <- function(word_vec_list, labels, dic, seq_len = c(225), right_pad = T) {
  ### Trunc sequence to max bucket length
  word_vec_list <- lapply(word_vec_list, head, n = max(seq_len))
  
  word_vec_length <- lapply(word_vec_list, length) %>% unlist()
  bucketID <- cut(word_vec_length, breaks = c(0, seq_len, Inf), include.lowest = T, 
    labels = F)
  
  ### Right or Left side Padding Pad sequences to their bucket length with
  ### dictionnary 0-label
  word_vec_list_pad <- lapply(1:length(word_vec_list), function(x) {
    length(word_vec_list[[x]]) <- seq_len[bucketID[x]]
    word_vec_list[[x]][is.na(word_vec_list[[x]])] <- names(dic[1])
    if (right_pad == F) 
      word_vec_list[[x]] <- rev(word_vec_list[[x]])
    return(word_vec_list[[x]])
  })
  
  ### Assign sequences to buckets and unroll them in order to be reshaped into arrays
  unrolled_arrays <- lapply(1:length(seq_len), function(x) unlist(word_vec_list_pad[bucketID == 
    x]))
  
  ### Assign labels to their buckets
  bucketed_labels <- lapply(1:length(seq_len), function(x) labels[bucketID == x])
  names(bucketed_labels) <- as.character(seq_len)
  
  ### Assign the dictionnary to each bucket terms
  unrolled_arrays_dic <- lapply(1:length(seq_len), function(x) dic[unrolled_arrays[[x]]])
  
  # Reshape into arrays having each sequence into a row
  features <- lapply(1:length(seq_len), function(x) {
    array(unrolled_arrays_dic[[x]], 
          dim = c(seq_len[x], length(unrolled_arrays_dic[[x]])/seq_len[x]))
  })
  
  names(features) <- as.character(seq_len)
  
  ### Combine data and labels into buckets
  buckets <- lapply(1:length(seq_len), function(x) c(list(data = features[[x]]), 
    list(label = bucketed_labels[[x]])))
  names(buckets) <- as.character(seq_len)
  
  ### reverse dictionnary
  rev_dic <- names(dic)
  names(rev_dic) <- dic
  
  return(list(buckets = buckets, dic = dic, rev_dic = rev_dic))
}


corpus_preprocessed_train <- text_pre_process(corpus = train_raw, count_threshold = 10, 
  dic = NULL)

corpus_preprocessed_test <- text_pre_process(corpus = test_raw, dic = corpus_preprocessed_train$dic)

seq_length_dist <- unlist(lapply(corpus_preprocessed_train$word_vec_list, length))
quantile(seq_length_dist, 0:20/20)

corpus_bucketed_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list, 
                                          labels = rep(0:1, each = 12500), 
                                          dic = corpus_preprocessed_train$dic, 
                                          seq_len = c(100, 150, 250, 400, 600), 
                                          right_pad = T)

corpus_bucketed_test <- make_bucket_data(word_vec_list = corpus_preprocessed_test$word_vec_list, 
                                         labels = rep(0:1, each = 12500), 
                                         dic = corpus_preprocessed_test$dic, 
                                         seq_len = c(100, 150, 250, 400, 600), 
                                         right_pad = T)

saveRDS(corpus_bucketed_train, file = "data/corpus_bucketed_train.rds")
saveRDS(corpus_bucketed_test, file = "data/corpus_bucketed_test.rds")


corpus_single_train <- make_bucket_data(word_vec_list = corpus_preprocessed_train$word_vec_list, 
                                          labels = rep(0:1, each = 12500), 
                                          dic = corpus_preprocessed_train$dic, 
                                          seq_len = c(600), 
                                          right_pad = T)

corpus_single_test <- make_bucket_data(word_vec_list = corpus_preprocessed_test$word_vec_list, 
                                         labels = rep(0:1, each = 12500), 
                                         dic = corpus_preprocessed_test$dic, 
                                         seq_len = c(600), 
                                         right_pad = T)

saveRDS(corpus_single_train, file = "data/corpus_single_train.rds")
saveRDS(corpus_single_test, file = "data/corpus_single_test.rds")