Commit
Merge pull request #9 from ucd-cepb/tyler-edits
Tyler edits
tylerandrewscott authored Nov 7, 2024
2 parents 66d3951 + eae3329 commit 7297895
Showing 15 changed files with 368 additions and 82 deletions.
50 changes: 34 additions & 16 deletions R/clean_entities.R
@@ -15,55 +15,73 @@
#' @param concatenator The concatenator used in elements of v. Defaults to an underscore; use regex notation.
#' @return a cleaned vector of entity names

#' @importFrom stringr str_remove_all
#' @importFrom stringi stri_replace_all_regex
#' @export
#'

clean_entities <- function(v, remove_nums=T, remove_trailing_s=T, concatenator = "_"){
# Input validation
if (!is.vector(v)) {
stop("'v' must be a vector")
}

if (!is.logical(remove_nums)) {
stop("'remove_nums' must be a boolean value (TRUE/FALSE)")
}

if (!is.logical(remove_trailing_s)) {
stop("'remove_trailing_s' must be a boolean value (TRUE/FALSE)")
}

if (!is.character(concatenator) || length(concatenator) != 1) {
stop("'concatenator' must be a single character string")
}

#format math font as regular font
#unicode of math font alphabet
maths <- c("\\U0001d44e", "\\U0001d44f", "\\U0001d450", "\\U0001d451", "\\U0001d452",
"\\U0001d453", "\\U0001d454", "\\u210e", "\\U0001d456", "\\U0001d457",
"\\U0001d458", "\\U0001d459", "\\U0001d45a", "\\U0001d45b", "\\U0001d45c",
"\\U0001d45d", "\\U0001d45e", "\\U0001d45f", "\\U0001d460", "\\U0001d461",
"\\U0001d462", "\\U0001d463", "\\U0001d464", "\\U0001d465", "\\U0001d466",
"\\U0001d467", "\\U0001d434", "\\U0001d435", "\\U0001d436", "\\U0001d437",
"\\U0001d438", "\\U0001d439", "\\U0001d43a", "\\U0001d43b", "\\U0001d43c",
"\\U0001d43d", "\\U0001d43e", "\\U0001d43f", "\\U0001d440", "\\U0001d441",
"\\U0001d442", "\\U0001d443", "\\U0001d444", "\\U0001d445", "\\U0001d446",
"\\U0001d447", "\\U0001d448", "\\U0001d449", "\\U0001d44a", "\\U0001d44b",
"\\U0001d44c", "\\U0001d44d")
"\\U0001d453", "\\U0001d454", "\\u210e", "\\U0001d456", "\\U0001d457",
"\\U0001d458", "\\U0001d459", "\\U0001d45a", "\\U0001d45b", "\\U0001d45c",
"\\U0001d45d", "\\U0001d45e", "\\U0001d45f", "\\U0001d460", "\\U0001d461",
"\\U0001d462", "\\U0001d463", "\\U0001d464", "\\U0001d465", "\\U0001d466",
"\\U0001d467", "\\U0001d434", "\\U0001d435", "\\U0001d436", "\\U0001d437",
"\\U0001d438", "\\U0001d439", "\\U0001d43a", "\\U0001d43b", "\\U0001d43c",
"\\U0001d43d", "\\U0001d43e", "\\U0001d43f", "\\U0001d440", "\\U0001d441",
"\\U0001d442", "\\U0001d443", "\\U0001d444", "\\U0001d445", "\\U0001d446",
"\\U0001d447", "\\U0001d448", "\\U0001d449", "\\U0001d44a", "\\U0001d44b",
"\\U0001d44c", "\\U0001d44d")
letts <- c(letters,LETTERS)

v <- pbapply::pblapply(1:length(v), function(i){
stringi::stri_replace_all_regex(v[i], pattern = maths,
v <- lapply(1:length(v), function(i){
stri_replace_all_regex(v[i], pattern = maths,
replacement = letts,
vectorize= F)
})

if(remove_trailing_s==T){
#remove strings with specific placement: trailing "'s"
index <- which(grepl("'s$",v,perl = T))
v[index] <- stringr::str_remove_all(v[index],"'s$")
v[index] <- str_remove_all(v[index],"'s$")

#remove strings with specific placement: trailing concatenator + s, e.g. "_s"
index <- which(grepl(paste0(concatenator,"s$"),v,perl = T))
v[index] <- stringr::str_remove_all(v[index],paste0(concatenator,"s$"))
v[index] <- str_remove_all(v[index],paste0(concatenator,"s$"))

}

#next, remove all non-word characters
remove <- c("\\W")
index <- which(grepl(paste(remove,collapse = '|'),v,perl = T))
v[index] <- stringr::str_remove_all(v[index],paste(remove,collapse = '|'))
v[index] <- str_remove_all(v[index],paste(remove,collapse = '|'))

#remove consecutive underscores that may have arisen due to previous cleaning step
v <- gsub(paste0('(',concatenator,')\\1+'), '\\1', v)

#remove leading or trailing underscores that may have arisen due to previous cleaning steps
remove <- c(paste0("^",concatenator), paste0(concatenator, "$"))
index <- which(grepl(paste(remove,collapse = '|'),v,perl = T))
v[index] <- stringr::str_remove_all(v[index],paste(remove,collapse = '|'))
v[index] <- str_remove_all(v[index],paste(remove,collapse = '|'))

#remove entities that have no letters (or numbers, if remove_nums == F)
if(remove_nums){
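For orientation, a minimal usage sketch of the revised clean_entities(); the entity strings below are hypothetical:

# hypothetical entity names from an upstream extraction step
ents <- c("Seattle_Supersonics's", "water_board_s", "EPA!!", "42")
clean_entities(ents, remove_nums = TRUE, remove_trailing_s = TRUE, concatenator = "_")
# expected result: trailing possessives and "_s" endings dropped, non-word
# characters stripped, and the all-numeric entity removed because remove_nums = TRUE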
46 changes: 38 additions & 8 deletions R/combine_networks.R
@@ -7,24 +7,54 @@
#' @param mode Either "multiplex" or "weighted" depending on the desired exported network format. If "weighted", collapses multiplex edges into a single weighted edge
#'
#' @return Single igraph object that consolidates nodes and edges from input graphs. If there are multiple nodes with the same name and different attributes originating from different graphs, this function preserves the node attributes associated with the version that appears most commonly. Adds a node attribute num_graphs_in, which denotes the number of input graphs each node was found in. For a weighted graph, the weight is equal to the original number of edges between the respective source and target nodes. Edge attributes for a multiplex graph are described in the help file of textnet_extract.
#' @import data.table
#' @importFrom data.table rbindlist
#' @import igraph
#' @importFrom methods is
#' @export
#'

combine_networks <- function(textnet_igraphs, mode){
combine_networks <- function(textnet_igraphs, mode = c('multiplex','weighted')){
# Check if textnet_igraphs is missing
if(missing(textnet_igraphs)) {
stop("Argument 'textnet_igraphs' is missing. Must provide a list of igraph objects.")
}

# Check if textnet_igraphs is a list
if(!is.list(textnet_igraphs)){
stop("please format your graphs as elements of a list.")
stop("Argument 'textnet_igraphs' must be a list of igraph objects.")
}
num_graphs <- length(textnet_igraphs)
if(sum(sapply(1:length(textnet_igraphs), function(i) !is(textnet_igraphs[[i]], "igraph")))>0){
stop("combine_networks only accepts igraph objects.")

# Check if list is empty
if(length(textnet_igraphs) == 0) {
stop("Argument 'textnet_igraphs' cannot be an empty list.")
}
if(!mode %in% c("multiplex","weighted")){
stop("Mode must be either multiplex or weighted.")

# Check if all elements are igraph objects
if(sum(sapply(textnet_igraphs, function(x) !is(x, "igraph"))) > 0){
stop("All elements in 'textnet_igraphs' must be igraph objects.")
}

# Check if mode is missing
if(missing(mode)) {
stop("Argument 'mode' is missing. Must be either 'multiplex' or 'weighted'.")
}

# Check if mode is character
if(!is.character(mode)) {
stop("Argument 'mode' must be a character string ('multiplex' or 'weighted').")
}

# Check if mode is length 1
if(length(mode) != 1) {
stop("Argument 'mode' must be a single value ('multiplex' or 'weighted').")
}

# Check if mode has valid value
if(!mode %in% c("multiplex","weighted")){
stop("Argument 'mode' must be either 'multiplex' or 'weighted'.")
}

num_graphs <- length(textnet_igraphs)
supernodes <- vector(mode = "list", length = length(num_graphs))
superedges <- vector(mode = "list", length = length(num_graphs))
for(m in 1:num_graphs){
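A hedged usage sketch of combine_networks() with its tightened argument checks; g1 and g2 stand in for igraph objects previously built from textNet extracts:

# g1 and g2 are hypothetical igraph objects from separate documents
combined_multi <- combine_networks(list(g1, g2), mode = "multiplex")
combined_wtd   <- combine_networks(list(g1, g2), mode = "weighted")
# per the documentation above, the weighted form collapses parallel edges into
# one weighted edge, and each node carries a num_graphs_in attribute
igraph::vertex_attr(combined_wtd, "num_graphs_in")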
20 changes: 18 additions & 2 deletions R/crawl_sentence.R
@@ -5,10 +5,26 @@
#'
#' @param s a data.frame containing the results of one (1) parsed spacy sentence
#' @return list with original parsed sentence + added dependency parsing
#'
#' @importFrom dplyr case_when
#'

crawl_sentence <- function(s){
# Input validation
if (!is.data.frame(s)) {
stop("'s' must be a data.frame")
}

if (nrow(s) == 0) {
stop("'s' must contain at least one row")
}

# Required columns
required_cols <- c("pos", "dep_rel", "head_token_id", "lemma", "token", "token_id", "tag")
missing_cols <- required_cols[!required_cols %in% names(s)]
if (length(missing_cols) > 0) {
stop(paste0("Missing required columns in 's': ", paste(missing_cols, collapse=", ")))
}

sentence <- s
empty_list <- create_empty_sentence_parse_list(nrow(sentence))
#tag auxes acting as auxes (eg "will" in "will attempt") for removal
@@ -51,7 +67,7 @@ empty_list$xcomp_verb <- ifelse(sentence$pos=="VERB" & sentence$dep_rel =="xcomp
#this categorizes each word as either source or target and
#saves it as a new column. Identifies head_verb_id and saves it as a new column
while(is.na(empty_list$source_or_target[tok_num]) & break_while_counter < 15){
empty_list$source_or_target[tok_num] <- dplyr::case_when(
empty_list$source_or_target[tok_num] <- case_when(
#if head_token_id trail traces back to an appositive before hitting anything else --> NA
#if you find an appositive, stop -- it's a duplicate and should not be counted
sentence[current_token_id,dep_rel] %in% c("appos") ~ "appos",
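crawl_sentence() appears to be an internal helper, so the sketch below is illustrative only; it assumes 'parsed' comes from spacyr::spacy_parse(txt, dependency = TRUE, tag = TRUE), which supplies the columns required by the new validation block:

# one sentence of a hypothetical spacyr parse, passed as a data.frame
one_sentence <- as.data.frame(parsed[parsed$doc_id == "text1" & parsed$sentence_id == 1, ])
crawled <- textNet:::crawl_sentence(one_sentence)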
28 changes: 28 additions & 0 deletions R/disambiguate.R
@@ -29,13 +29,41 @@
#' @return a cleaned textnet extract. See textnet_extract help file for structure.
#'
#' @import data.table
#' @importFrom stringr str_detect str_remove_all str_replace_all
#' @importFrom dplyr arrange desc filter
#' @importFrom magrittr %>%
#' @importFrom methods is
#' @export
#'

#if recursive is true, runs it multiple times to reach the end of the chain.

disambiguate <- function(textnet_extract, from, to, match_partial_entity=rep(F, length(from)), try_drop=NULL, recursive=T, concatenator="_"){
# Input validation
if(!is.list(from)) {
stop("'from' must be a list")
}

if(!is.list(to)) {
stop("'to' must be a list")
}

if(!is.logical(match_partial_entity)) {
stop("'match_partial_entity' must be a logical vector")
}

if(!is.null(try_drop) && !is.character(try_drop)) {
stop("'try_drop' must be NULL or a character vector")
}

if(!is.logical(recursive)) {
stop("'recursive' must be a logical value")
}

if(!is.character(concatenator) || length(concatenator) != 1) {
stop("'concatenator' must be a single character string")
}

options(warn=1)
#Data formatting checks####
multi_to <- sapply(1:length(to), function(w) length(to[[w]]) > 1)
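A sketch of a disambiguate() call that satisfies the new checks; the extract and the alias pairs are hypothetical:

# 'extract' stands in for a textnet_extract() result; map two alias spellings
# onto one canonical entity name
cleaned <- disambiguate(
  textnet_extract = extract,
  from = list("EPA", "Environmental_Protection_Agency"),
  to   = list("US_EPA", "US_EPA"),
  recursive = TRUE,
  concatenator = "_"
)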
5 changes: 3 additions & 2 deletions R/download_dictionary.R
@@ -3,7 +3,9 @@

#' Downloads the SCOWL 2020.12.07 dictionary created by Kevin Atkinson, which has a larger vocabulary than Qdap
#' Code adapted from Mirek Długosz 2016 https://mirekdlugosz.com/blog/2016/how-to-use-r-to-recognize-if-given-string-is-a-word/

#' @importFrom utils download.file unzip
#' @importFrom tools file_ext
#' @importFrom stringi stri_read_lines stri_escape_unicode

download_dictionary <- function(){

@@ -33,4 +35,3 @@ download_dictionary <- function(){
save(eng_words, file = "data/eng_words.rda")

}
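download_dictionary() takes no arguments; a developer-side sketch, assuming it remains unexported and writes data/eng_words.rda into the package source tree:

# regenerate the bundled English word list from the SCOWL 2020.12.07 dictionary
textNet:::download_dictionary()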

23 changes: 18 additions & 5 deletions R/entity_consolidate_replicate.R
@@ -6,21 +6,34 @@
#' Instead, this function basically breaks off a feature of the original function and adds a new column to the original spacyr data.frame that is the concatenated entity.
#' This feature is inefficient in that the concatenated entity is then replicated multiple times, but this does seem to be the easiest way to preserve the other data.
#'
#' @param x parsed spacy document
#' @param x parsed spacy document in data.frame format
#' @param concatenator A character that separates string segments when they are collapsed into a single entity. Defaults to "_"
#' @param remove regex formatted strings to remove as entity components (like "the" in "the Seattle Supersonics")
#' @return original data frame with added column for concatenated entity
#'
#' @import data.table
#' @importFrom stringr str_remove_all
#'

entity_consolidate_replicate <- function(x, concatenator = "_",remove = NULL) {
#Remove tokens that have no alphabet characters
entity_consolidate_replicate <- function(x, concatenator = "_", remove = NULL) {
# Input validation
if (!is.data.frame(x)) {
stop("'x' must be a data.frame")
}

if (!is.character(concatenator) || length(concatenator) != 1) {
stop("'concatenator' must be a single character string")
}

spacy_result <- data.table::as.data.table(x)
if (!is.null(remove) && !is.character(remove)) {
stop("'remove' must be NULL or a character vector")
}

#Remove tokens that have no alphabet characters
spacy_result <- as.data.table(x)
if(!is.null(remove)){
index <- which(grepl(paste(remove,collapse = '|'),spacy_result$token,perl = T)&spacy_result$entity!="")
spacy_result$token[index] <- stringr::str_remove_all(spacy_result$token[index],paste(remove,collapse = '|'))
spacy_result$token[index] <- str_remove_all(spacy_result$token[index],paste(remove,collapse = '|'))
spacy_result$entity[index] <- ""
}

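A hedged sketch of entity_consolidate_replicate(); 'parsed' is again a hypothetical spacyr parse, the function is assumed to be unexported (hence the ::: call), and the remove pattern is an illustrative choice:

# drop standalone "the"/"The" tokens from entity spans before the remaining
# tokens are concatenated with "_"
parsed_ents <- textNet:::entity_consolidate_replicate(parsed, concatenator = "_",
                                                      remove = "^[Tt]he$")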
30 changes: 25 additions & 5 deletions R/export_to_network.R
@@ -31,20 +31,41 @@
#' \item num_communities -- number of communities using louvain cluster algorithm on a weighted, undirected, non-multiplex version of the network
#' \item percent_vbn, percent_vbg, percent_vpb, percent_vbd, percent_vb, percent_vbz -- percent of edges in the graph that are of the respective verb tense
#' }
#' @import igraph
#' @importFrom dplyr filter
#' @importFrom network network network.size network.edgecount network.density
#' @importFrom sna connectedness centralization gtrans
#' @importFrom stats median
#' @importFrom base unique
#' @export
#'

export_to_network <- function(textnet_extract, export_format, keep_isolates=T, collapse_edges, self_loops){
# Input validation
if(!is.list(textnet_extract)) {
stop("'textnet_extract' must be a list")
}

if(!is.character(export_format) || length(export_format) != 1) {
stop("'export_format' must be a single character string")
}

if(!export_format %in% c("igraph","network")){
stop("export_format must be either 'igraph' or 'network'")
}
if(!keep_isolates %in% c(T,F)){
stop("keep_isolates must be either T or F.")

if(!is.logical(keep_isolates) || length(keep_isolates) != 1) {
stop("'keep_isolates' must be a single logical value")
}
if(!collapse_edges %in% c(T,F)){
stop("collapse_edges must be either T or F.")

if(!is.logical(collapse_edges) || length(collapse_edges) != 1) {
stop("'collapse_edges' must be a single logical value")
}

if(!is.logical(self_loops) || length(self_loops) != 1) {
stop("'self_loops' must be a single logical value")
}

textnet_extract$edgelist <- dplyr::filter(textnet_extract$edgelist, !is.na(textnet_extract$edgelist$source) & !is.na(textnet_extract$edgelist$target))


@@ -186,4 +207,3 @@ export_to_network <- function(textnet_extract, export_format, keep_isolates=T, c

}
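A usage sketch of export_to_network() under the stricter validation; 'extract' is a hypothetical textnet_extract() result:

# igraph export, keeping isolates, leaving multiplex edges uncollapsed,
# and retaining self-loops
net <- export_to_network(extract, export_format = "igraph",
                         keep_isolates = TRUE, collapse_edges = FALSE,
                         self_loops = TRUE)
# per the documentation above, the export also reports graph-level statistics
# such as num_communities and the percent_vb* verb-tense shares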


27 changes: 21 additions & 6 deletions R/filter_sentences.R
@@ -15,20 +15,35 @@
#' @param case_sensitive Whether the token is required to have the specific casing used in the dictionary to count as a match. Defaults to F.
#' @return a cleaned version of 'file', keeping only the sentences that pass the threshold requirement.
#' @importFrom stats aggregate
#' @importFrom dplyr left_join

#' @export
#'

filter_sentences <- function(file, filter = textNet::eng_words,
percent_threshold = 40, case_sensitive = F){
#TODO class type checks
if(case_sensitive == F){
# Input validation
if(!is.data.frame(file)) {
stop("'file' must be a data frame")
}

if(!is.character(filter)) {
stop("'filter' must be a character vector")
}

if(!is.numeric(percent_threshold) || percent_threshold < 0 || percent_threshold > 100) {
stop("'percent_threshold' must be a numeric value between 0 and 100")
}

if(!is.logical(case_sensitive)) {
stop("'case_sensitive' must be a logical value (TRUE/FALSE)")
}

if(!case_sensitive){
filter <- tolower(filter)
tokens <- tolower(file$token)
}else if(case_sensitive ==T){
}else if(case_sensitive){
tokens <- file$token
}else{
stop("case_sensitive must be T or F.")
}
eng <- tokens %in% filter
percent_pass_filter <- aggregate(eng,
@@ -39,6 +54,6 @@ filter_sentences <- function(file, filter = textNet::eng_words,
percent_pass_filter$x <- NULL
colnames(percent_pass_filter) <- c("sentence_id", "doc_id")

file <- dplyr::left_join(percent_pass_filter, file)
file <- left_join(percent_pass_filter, file)
return(file)
}
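Finally, a sketch of filter_sentences() with its new argument checks; 'parsed' is a hypothetical parsed document in data.frame form:

# keep only sentences in which at least 40% of tokens match the bundled
# English word list
filtered <- filter_sentences(parsed, filter = textNet::eng_words,
                             percent_threshold = 40, case_sensitive = FALSE)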
