diff --git a/R/clean_entities.R b/R/clean_entities.R index 9e64d17..32ba8a5 100644 --- a/R/clean_entities.R +++ b/R/clean_entities.R @@ -15,28 +15,46 @@ #' @param concatenator Defaults to an underscore. Use regex notation. The concatenator used in elements of v. #' @return a cleaned vector of entity names +#' @importFrom stringr str_remove_all +#' @importFrom stringi stri_replace_all_regex #' @export #' clean_entities <- function(v, remove_nums=T, remove_trailing_s=T, concatenator = "_"){ + # Input validation + if (!is.vector(v)) { + stop("'v' must be a vector") + } + + if (!is.logical(remove_nums)) { + stop("'remove_nums' must be a boolean value (TRUE/FALSE)") + } + + if (!is.logical(remove_trailing_s)) { + stop("'remove_trailing_s' must be a boolean value (TRUE/FALSE)") + } + + if (!is.character(concatenator) || length(concatenator) != 1) { + stop("'concatenator' must be a single character string") + } #format math font as regular font #unicode of math font alphabet maths <- c("\\U0001d44e", "\\U0001d44f", "\\U0001d450", "\\U0001d451", "\\U0001d452", - "\\U0001d453", "\\U0001d454", "\\u210e", "\\U0001d456", "\\U0001d457", - "\\U0001d458", "\\U0001d459", "\\U0001d45a", "\\U0001d45b", "\\U0001d45c", - "\\U0001d45d", "\\U0001d45e", "\\U0001d45f", "\\U0001d460", "\\U0001d461", - "\\U0001d462", "\\U0001d463", "\\U0001d464", "\\U0001d465", "\\U0001d466", - "\\U0001d467", "\\U0001d434", "\\U0001d435", "\\U0001d436", "\\U0001d437", - "\\U0001d438", "\\U0001d439", "\\U0001d43a", "\\U0001d43b", "\\U0001d43c", - "\\U0001d43d", "\\U0001d43e", "\\U0001d43f", "\\U0001d440", "\\U0001d441", - "\\U0001d442", "\\U0001d443", "\\U0001d444", "\\U0001d445", "\\U0001d446", - "\\U0001d447", "\\U0001d448", "\\U0001d449", "\\U0001d44a", "\\U0001d44b", - "\\U0001d44c", "\\U0001d44d") + "\\U0001d453", "\\U0001d454", "\\u210e", "\\U0001d456", "\\U0001d457", + "\\U0001d458", "\\U0001d459", "\\U0001d45a", "\\U0001d45b", "\\U0001d45c", + "\\U0001d45d", "\\U0001d45e", "\\U0001d45f", "\\U0001d460", "\\U0001d461", + "\\U0001d462", "\\U0001d463", "\\U0001d464", "\\U0001d465", "\\U0001d466", + "\\U0001d467", "\\U0001d434", "\\U0001d435", "\\U0001d436", "\\U0001d437", + "\\U0001d438", "\\U0001d439", "\\U0001d43a", "\\U0001d43b", "\\U0001d43c", + "\\U0001d43d", "\\U0001d43e", "\\U0001d43f", "\\U0001d440", "\\U0001d441", + "\\U0001d442", "\\U0001d443", "\\U0001d444", "\\U0001d445", "\\U0001d446", + "\\U0001d447", "\\U0001d448", "\\U0001d449", "\\U0001d44a", "\\U0001d44b", + "\\U0001d44c", "\\U0001d44d") letts <- c(letters,LETTERS) - v <- pbapply::pblapply(1:length(v), function(i){ - stringi::stri_replace_all_regex(v[i], pattern = maths, + v <- lapply(1:length(v), function(i){ + stri_replace_all_regex(v[i], pattern = maths, replacement = letts, vectorize= F) }) @@ -44,18 +62,18 @@ clean_entities <- function(v, remove_nums=T, remove_trailing_s=T, concatenator = if(remove_trailing_s==T){ #remove strings with specific placement: trailing "'s" index <- which(grepl("'s$",v,perl = T)) - v[index] <- stringr::str_remove_all(v[index],"'s$") + v[index] <- str_remove_all(v[index],"'s$") #remove strings with specific placement: trailing concatenator + s, e.g. 
"_s" index <- which(grepl(paste0(concatenator,"s$"),v,perl = T)) - v[index] <- stringr::str_remove_all(v[index],paste0(concatenator,"s$")) + v[index] <- str_remove_all(v[index],paste0(concatenator,"s$")) } #next, remove all non-word characters remove <- c("\\W") index <- which(grepl(paste(remove,collapse = '|'),v,perl = T)) - v[index] <- stringr::str_remove_all(v[index],paste(remove,collapse = '|')) + v[index] <- str_remove_all(v[index],paste(remove,collapse = '|')) #remove consecutive underscores that may have arisen due to previous cleaning step v <- gsub(paste0('(',concatenator,')\\1+'), '\\1', v) @@ -63,7 +81,7 @@ clean_entities <- function(v, remove_nums=T, remove_trailing_s=T, concatenator = #remove leading or trailing underscores that may have arisen due to previous cleaning steps remove <- c(paste0("^",concatenator), paste0(concatenator, "$")) index <- which(grepl(paste(remove,collapse = '|'),v,perl = T)) - v[index] <- stringr::str_remove_all(v[index],paste(remove,collapse = '|')) + v[index] <- str_remove_all(v[index],paste(remove,collapse = '|')) #remove entities that have no letters (or numbers, if remove_nums == F) if(remove_nums){ diff --git a/R/combine_networks.R b/R/combine_networks.R index 0d6cf7b..3e1e031 100644 --- a/R/combine_networks.R +++ b/R/combine_networks.R @@ -7,24 +7,54 @@ #' @param mode Either "multiplex" or "weighted" depending on the desired exported network format. If "weighted", collapses multiplex edges into a single weighted edge #' #' @return Single igraph object that consolidates nodes and edges from input graphs. If there are multiple nodes with the same name and different attributes originating from different graphs, this function preserves the node attributes associated with the version that appears most commonly. Adds a node attribute num_graphs_in, which denotes the number of input graphs each node was found in. For a weighted graph, the weight is equal to the original number of edges between the respective source and target nodes. Edge attributes for a multiplex graph are described in the help file of textnet_extract. -#' @import data.table +#' @importFrom data.table rbindlist #' @import igraph #' @importFrom methods is #' @export #' -combine_networks <- function(textnet_igraphs, mode){ +combine_networks <- function(textnet_igraphs, mode = c('multiplex','weighted')){ + # Check if textnet_igraphs is missing + if(missing(textnet_igraphs)) { + stop("Argument 'textnet_igraphs' is missing. Must provide a list of igraph objects.") + } + + # Check if textnet_igraphs is a list if(!is.list(textnet_igraphs)){ - stop("please format your graphs as elements of a list.") + stop("Argument 'textnet_igraphs' must be a list of igraph objects.") } - num_graphs <- length(textnet_igraphs) - if(sum(sapply(1:length(textnet_igraphs), function(i) !is(textnet_igraphs[[i]], "igraph")))>0){ - stop("combine_networks only accepts igraph objects.") + + # Check if list is empty + if(length(textnet_igraphs) == 0) { + stop("Argument 'textnet_igraphs' cannot be an empty list.") } - if(!mode %in% c("multiplex","weighted")){ - stop("Mode must be either multiplex or weighted.") + + # Check if all elements are igraph objects + if(sum(sapply(textnet_igraphs, function(x) !is(x, "igraph"))) > 0){ + stop("All elements in 'textnet_igraphs' must be igraph objects.") + } + + # Check if mode is missing + if(missing(mode)) { + stop("Argument 'mode' is missing. 
Must be either 'multiplex' or 'weighted'.") } + # Check if mode is character + if(!is.character(mode)) { + stop("Argument 'mode' must be a character string ('multiplex' or 'weighted').") + } + + # Check if mode is length 1 + if(length(mode) != 1) { + stop("Argument 'mode' must be a single value ('multiplex' or 'weighted').") + } + + # Check if mode has valid value + if(!mode %in% c("multiplex","weighted")){ + stop("Argument 'mode' must be either 'multiplex' or 'weighted'.") + } + + num_graphs <- length(textnet_igraphs) supernodes <- vector(mode = "list", length = length(num_graphs)) superedges <- vector(mode = "list", length = length(num_graphs)) for(m in 1:num_graphs){ diff --git a/R/crawl_sentence.R b/R/crawl_sentence.R index 2ec354d..8cf1a79 100644 --- a/R/crawl_sentence.R +++ b/R/crawl_sentence.R @@ -5,10 +5,26 @@ #' #' @param s a data.frame containing the results of one (1) parsed spacy sentence #' @return list with original parsed sentence + added dependency parsing -#' +#' @importFrom dplyr case_when #' crawl_sentence <- function(s){ + # Input validation + if (!is.data.frame(s)) { + stop("'s' must be a data.frame") + } + + if (nrow(s) == 0) { + stop("'s' must contain at least one row") + } + + # Required columns + required_cols <- c("pos", "dep_rel", "head_token_id", "lemma", "token", "token_id", "tag") + missing_cols <- required_cols[!required_cols %in% names(s)] + if (length(missing_cols) > 0) { + stop(paste0("Missing required columns in 's': ", paste(missing_cols, collapse=", "))) + } + sentence <- s empty_list <- create_empty_sentence_parse_list(nrow(sentence)) #tag auxes acting as auxes (eg "will" in "will attempt") for removal @@ -51,7 +67,7 @@ empty_list$xcomp_verb <- ifelse(sentence$pos=="VERB" & sentence$dep_rel =="xcomp #this categorizes each word as either source or target and #saves it as a new column. Identifies head_verb_id and saves it as a new column while(is.na(empty_list$source_or_target[tok_num]) & break_while_counter < 15){ - empty_list$source_or_target[tok_num] <- dplyr::case_when( + empty_list$source_or_target[tok_num] <- case_when( #if head_token_id trail traces back to an appositive before hitting anything else --> NA #if you find an appositive, stop -- it's a duplicate and should not be counted sentence[current_token_id,dep_rel] %in% c("appos") ~ "appos", diff --git a/R/disambiguate.R b/R/disambiguate.R index 5b1c1e0..15e2e4d 100644 --- a/R/disambiguate.R +++ b/R/disambiguate.R @@ -29,13 +29,41 @@ #' @return a cleaned textnet extract. See textnet_extract help file for structure. #' #' @import data.table +#' @importFrom stringr str_detect str_remove_all str_replace_all +#' @importFrom dplyr arrange desc filter #' @importFrom magrittr %>% +#' @importFrom methods is #' @export #' #if recursive is true, runs it multiple times to reach the end of the chain. 
disambiguate <- function(textnet_extract, from, to, match_partial_entity=rep(F, length(from)), try_drop=NULL, recursive=T, concatenator="_"){ + # Input validation + if(!is.list(from)) { + stop("'from' must be a list") + } + + if(!is.list(to)) { + stop("'to' must be a list") + } + + if(!is.logical(match_partial_entity)) { + stop("'match_partial_entity' must be a logical vector") + } + + if(!is.null(try_drop) && !is.character(try_drop)) { + stop("'try_drop' must be NULL or a character vector") + } + + if(!is.logical(recursive)) { + stop("'recursive' must be a logical value") + } + + if(!is.character(concatenator) || length(concatenator) != 1) { + stop("'concatenator' must be a single character string") + } + options(warn=1) #Data formatting checks#### multi_to <- sapply(1:length(to), function(w) length(to[[w]]) > 1) diff --git a/R/download_dictionary.R b/R/download_dictionary.R index 9f21544..117f584 100644 --- a/R/download_dictionary.R +++ b/R/download_dictionary.R @@ -3,7 +3,9 @@ #' Downloads the SCOWL 2020.12.07 dictionary created by Kevin Atkinson, which has a larger vocabulary than Qdap #' Code adapted from Mirek Długosz 2016 https://mirekdlugosz.com/blog/2016/how-to-use-r-to-recognize-if-given-string-is-a-word/ - +#' @importFrom utils download.file unzip +#' @importFrom tools file_ext +#' @importFrom stringi stri_read_lines stri_escape_unicode download_dictionary <- function(){ @@ -33,4 +35,3 @@ download_dictionary <- function(){ save(eng_words, file = "data/eng_words.rda") } - diff --git a/R/entity_consolidate_replicate.R b/R/entity_consolidate_replicate.R index 5212fef..9a3fc87 100644 --- a/R/entity_consolidate_replicate.R +++ b/R/entity_consolidate_replicate.R @@ -6,21 +6,34 @@ #' Instead, this function basically breaks off a feature of the original function and adds a new column to the original spacyr data.frame that is the concatenated entity. #' This feature is inefficient in that the concatenated entity is then replicated multiple times, but this does seem to be the easiest way to preserve the other data. #' -#' @param x parsed spacy document +#' @param x parsed spacy document in data.frame format #' @param concatenator A character that separates string segments when they are collapsed into a single entity. 
Defaults to "_"
 #' @param remove regex formatted strings to remove as entity components (like "the" in "the Seattle Supersonics")
 #' @return original data frame with added column for concatenated entity
 #'
 #' @import data.table
+#' @importFrom stringr str_remove_all
 #'
-entity_consolidate_replicate <- function(x, concatenator = "_",remove = NULL) {
-  #Remove tokens that have no alphabet characters
+entity_consolidate_replicate <- function(x, concatenator = "_", remove = NULL) {
+  # Input validation
+  if (!is.data.frame(x)) {
+    stop("'x' must be a data.frame")
+  }
+
+  if (!is.character(concatenator) || length(concatenator) != 1) {
+    stop("'concatenator' must be a single character string")
+  }
 
-  spacy_result <- data.table::as.data.table(x)
+  if (!is.null(remove) && !is.character(remove)) {
+    stop("'remove' must be NULL or a character vector")
+  }
+
+  #Remove tokens that have no alphabet characters
+  spacy_result <- as.data.table(x)
   if(!is.null(remove)){
     index <- which(grepl(paste(remove,collapse = '|'),spacy_result$token,perl = T)&spacy_result$entity!="")
-    spacy_result$token[index] <- stringr::str_remove_all(spacy_result$token[index],paste(remove,collapse = '|'))
+    spacy_result$token[index] <- str_remove_all(spacy_result$token[index],paste(remove,collapse = '|'))
     spacy_result$entity[index] <- ""
   }
diff --git a/R/export_to_network.R b/R/export_to_network.R
index c19e1b4..49a1f8e 100644
--- a/R/export_to_network.R
+++ b/R/export_to_network.R
@@ -31,20 +31,41 @@
 #'   \item num_communities -- number of communities using louvain cluster algorithm on a weighted, undirected, non-multiplex version of the network
 #'   \item percent_vbn, percent_vbg, percent_vpb, percent_vbd, percent_vb, percent_vbz -- percent of edges in the graph that are of the respective verb tense
 #' }
+#' @import igraph
+#' @importFrom dplyr filter
+#' @importFrom network network network.size network.edgecount network.density
+#' @importFrom sna connectedness centralization gtrans
+#' @importFrom stats median
 #' @export
 #'
 export_to_network <- function(textnet_extract, export_format, keep_isolates=T, collapse_edges, self_loops){
+  # Input validation
+  if(!is.list(textnet_extract)) {
+    stop("'textnet_extract' must be a list")
+  }
+
+  if(!is.character(export_format) || length(export_format) != 1) {
+    stop("'export_format' must be a single character string")
+  }
+
   if(!export_format %in% c("igraph","network")){
     stop("export_format must be either 'igraph' or 'network'")
   }
-  if(!keep_isolates %in% c(T,F)){
-    stop("keep_isolates must be either T or F.")
+
+  if(!is.logical(keep_isolates) || length(keep_isolates) != 1) {
+    stop("'keep_isolates' must be a single logical value")
   }
-  if(!collapse_edges %in% c(T,F)){
-    stop("collapse_edges must be either T or F.")
+
+  if(!is.logical(collapse_edges) || length(collapse_edges) != 1) {
+    stop("'collapse_edges' must be a single logical value")
   }
 
+  if(!is.logical(self_loops) || length(self_loops) != 1) {
+    stop("'self_loops' must be a single logical value")
+  }
+
   textnet_extract$edgelist <- dplyr::filter(textnet_extract$edgelist, !is.na(textnet_extract$edgelist$source) &
                                      !is.na(textnet_extract$edgelist$target))
@@ -186,4 +207,3 @@ export_to_network <- function(textnet_extract, export_format, keep_isolates=T, c
 
 }
 
-
diff --git a/R/filter_sentences.R b/R/filter_sentences.R
index 5aa94e5..bdebb75 100644
--- a/R/filter_sentences.R
+++ b/R/filter_sentences.R
@@ -15,20 +15,35 @@
 #' @param case_sensitive Whether the token is required to have the specific casing used in the dictionary to count as a match. Defaults to F.
 #' @return a cleaned version of 'file', keeping only the sentences that pass the threshold requirement.
 #' @importFrom stats aggregate
+#' @importFrom dplyr left_join
 #' @export
 #'
 filter_sentences <- function(file, filter = textNet::eng_words,
                              percent_threshold = 40, case_sensitive = F){
-#TODO class type checks
-  if(case_sensitive == F){
+  # Input validation
+  if(!is.data.frame(file)) {
+    stop("'file' must be a data frame")
+  }
+
+  if(!is.character(filter)) {
+    stop("'filter' must be a character vector")
+  }
+
+  if(!is.numeric(percent_threshold) || length(percent_threshold) != 1 || percent_threshold < 0 || percent_threshold > 100) {
+    stop("'percent_threshold' must be a single numeric value between 0 and 100")
+  }
+
+  if(!is.logical(case_sensitive) || length(case_sensitive) != 1) {
+    stop("'case_sensitive' must be a single logical value (TRUE/FALSE)")
+  }
+
+  if(!case_sensitive){
     filter <- tolower(filter)
     tokens <- tolower(file$token)
-  }else if(case_sensitive ==T){
+  }else{
     tokens <- file$token
-  }else{
-    stop("case_sensitive must be T or F.")
   }
   eng <- tokens %in% filter
   percent_pass_filter <- aggregate(eng,
@@ -39,6 +54,6 @@ filter_sentences <- function(file, filter = textNet::eng_words,
   percent_pass_filter$x <- NULL
   colnames(percent_pass_filter) <- c("sentence_id", "doc_id")
 
-  file <- dplyr::left_join(percent_pass_filter, file)
+  file <- left_join(percent_pass_filter, file)
   return(file)
 }
\ No newline at end of file
diff --git a/R/find_acronyms.R b/R/find_acronyms.R
index 198db46..b0c153c 100644
--- a/R/find_acronyms.R
+++ b/R/find_acronyms.R
@@ -5,7 +5,9 @@
 #'
 #' @param str A character vector
 #'
-#' @import data.table
+#' @importFrom data.table setDT setcolorder
+#' @importFrom stringr str_split str_remove_all str_replace_all
+#' @importFrom stringi stri_match_last stri_match_all
 #' @return a data table with a "name" column and an "acronym" column representing its associated acronym.
 #' Each row corresponds to a unique match in the document.
@@ -14,29 +16,34 @@ #' find_acronyms <- function(str){ - paren_splits <- stringr::str_split(str, pattern = "\\)") + # Input validation + if(!is.character(str)) { + stop("'str' must be a character vector") + } + + paren_splits <- str_split(str, pattern = "\\)") paren_splits2 <- lapply(paren_splits, function (k) k[nchar(k)>0]) paren_splits3 <- lapply(paren_splits2, function(m) stringr::str_split(m, pattern = "\\(")) paren_splits4 <- lapply(paren_splits3, function (j) lapply(j, function(m) m[length(m)==2])) paren_splits4 <- unlist(paren_splits4, recursive=F) paren_splits5 <- do.call(rbind, paren_splits4) - paren_splits$acr1 <- stringi::stri_match_last(str = paren_splits5[,1], regex ="\\b[A-Z]+\\b") - paren_splits$acr2 <- stringi::stri_match_all(str = paren_splits5[,2], regex ="\\b[A-Z]+\\b") - paren_splits$abb1 <- stringr::str_remove_all(paren_splits5[,1],"[^A-Z]") + paren_splits$acr1 <- stri_match_last(str = paren_splits5[,1], regex ="\\b[A-Z]+\\b") + paren_splits$acr2 <- stri_match_all(str = paren_splits5[,2], regex ="\\b[A-Z]+\\b") + paren_splits$abb1 <- str_remove_all(paren_splits5[,1],"[^A-Z]") paren_splits$abb1 <- sapply(1:length(paren_splits$acr2), function(s) sapply(1:length(paren_splits$acr2[[s]]), function (m){ - stringi::stri_match_last(str=paren_splits$abb1[s], + stri_match_last(str=paren_splits$abb1[s], regex = paren_splits$acr2[[s]][m]) })) sp_lower <- "[\\s|a-z]+" paren_splits$name1 <- sapply(1:length(paren_splits$acr2), function(s) sapply(1:length(paren_splits$acr2[[s]]), function (m){ - stringi::stri_match_last(str=paren_splits5[s,1], regex = paste0(paste0(stringr::str_split(paren_splits$acr2[[s]][m],pattern="")[[1]],collapse=sp_lower),"[a-z]+")) + stri_match_last(str=paren_splits5[s,1], regex = paste0(paste0(stringr::str_split(paren_splits$acr2[[s]][m],pattern="")[[1]],collapse=sp_lower),"[a-z]+")) })) - paren_splits$abb2 <- stringr::str_remove_all(paren_splits5[,2],"[^A-Z]") + paren_splits$abb2 <- str_remove_all(paren_splits5[,2],"[^A-Z]") paren_splits$name2 <- sapply(1:length(paren_splits$abb2), function(s) - stringi::stri_match_last(str=paren_splits5[s,2], regex = paste0(paste0(stringr::str_split(paren_splits$acr1[s],pattern="")[[1]],collapse=sp_lower),"[a-z]+")) + stri_match_last(str=paren_splits5[s,2], regex = paste0(paste0(stringr::str_split(paren_splits$acr1[s],pattern="")[[1]],collapse=sp_lower),"[a-z]+")) ) paren_splits$acr1 <- as.vector(paren_splits$acr1) @@ -45,13 +52,13 @@ find_acronyms <- function(str){ paren_splits$acr2 <- unlist(paren_splits$acr2) paren_splits$name1 <- unlist(paren_splits$name1) - acronym_matches <- data.table::setDT(list("name" = paren_splits$name2,"acronym" = paren_splits$acr1)) - acronym_matches2 <- data.table::setDT(list("name" = paren_splits$name1, "acronym" = paren_splits$acr2)) + acronym_matches <- setDT(list("name" = paren_splits$name2,"acronym" = paren_splits$acr1)) + acronym_matches2 <- setDT(list("name" = paren_splits$name1, "acronym" = paren_splits$acr2)) acronym_matches <- rbind(acronym_matches, acronym_matches2) acronym_matches <- acronym_matches[!is.na(acronym) & !is.na(name) &nchar(acronym)>1,] - acronym_matches$name <- stringr::str_replace_all(acronym_matches$name,"-|\\s+","_") + acronym_matches$name <- str_replace_all(acronym_matches$name,"-|\\s+","_") #change hyphens and spaces to underscores, since in spacyparse they are treated as separate tokens @@ -63,6 +70,6 @@ find_acronyms <- function(str){ acronym_matches <- acronym_matches[N==1,] acronym_matches <- acronym_matches[,N:=NULL] #reorder cols - 
data.table::setcolorder(x=acronym_matches,neworder=c("name", "acronym")) + setcolorder(x=acronym_matches,neworder=c("name", "acronym")) return(acronym_matches) } \ No newline at end of file diff --git a/R/parse_text.R b/R/parse_text.R index 59fc16b..c498322 100644 --- a/R/parse_text.R +++ b/R/parse_text.R @@ -12,13 +12,48 @@ #' @param overwrite A boolean. Whether to overwrite existing files #' @param custom_entities A named list. This does not overwrite the entity determination of the NLP engine, but rather catches user-defined entities that are not otherwise detected by the engine. Best used in combination with phrases_to_concatenate, since the custom entity label will only be applied if the entire token matches the definition. Does not search multiple consecutive tokens to define a match. These will be applied to all documents. #' @return A data.frame of tokens. For more information on the format, see the spacyr::spacy_parse help file -#' @import data.table +#' @importFrom data.table setDT +#' @importFrom stringr str_detect str_replace_all +#' @importFrom stringi stri_replace_all_regex stri_escape_unicode +#' @importFrom pbapply pblapply +#' @importFrom utils data +#' @importFrom reticulate py_config +#' @importFrom spacyr spacy_initialize spacy_parse spacy_finalize #' @export parse_text <- function(ret_path, keep_hyph_together=F, phrases_to_concatenate=NA, concatenator="_", text_list, parsed_filenames, overwrite=T, custom_entities = NULL){ + # Input validation + if(!is.character(ret_path) || length(ret_path) != 1) { + stop("'ret_path' must be a single character string") + } + + if(!is.logical(keep_hyph_together) || length(keep_hyph_together) != 1) { + stop("'keep_hyph_together' must be a single logical value") + } + + if(!is.character(phrases_to_concatenate) && !is.na(phrases_to_concatenate)) { + stop("'phrases_to_concatenate' must be either NA or a character vector") + } + + if(!is.character(concatenator) || length(concatenator) != 1) { + stop("'concatenator' must be a single character string") + } + + if(!is.list(text_list)) { + stop("'text_list' must be a list") + } + + if(!is.character(parsed_filenames)) { + stop("'parsed_filenames' must be a character vector") + } + + if(!is.logical(overwrite) || length(overwrite) != 1) { + stop("'overwrite' must be a single logical value") + } + if(!is.null(custom_entities)){ if(!is.list(custom_entities) | is.null(names(custom_entities)) | "" %in% names(custom_entities)){ @@ -45,7 +80,7 @@ parse_text <- function(ret_path, keep_hyph_together=F, phrases_to_concatenate=NA #automatically gets rid of phrases without a space - phrases_to_concatenate <- phrases_to_concatenate[stringr::str_detect(phrases_to_concatenate,"\\s")] + phrases_to_concatenate <- phrases_to_concatenate[str_detect(phrases_to_concatenate,"\\s")] #generate phrases defaults to false, since it appears spaCy has a more difficult time #identifying proper name phrases linked by underscore as entities @@ -80,8 +115,8 @@ parse_text <- function(ret_path, keep_hyph_together=F, phrases_to_concatenate=NA dependency = T, nounphrase = T) saveRDS(parsedtxt, parsed_filenames[m]) - lettertokens <- parsedtxt$token[stringr::str_detect(parsedtxt$token, "[a-zA-Z]")] - lettertokensunicodeescaped <- stringi::stri_escape_unicode(lettertokens) + lettertokens <- parsedtxt$token[str_detect(parsedtxt$token, "[a-zA-Z]")] + lettertokensunicodeescaped <- stri_escape_unicode(lettertokens) utils::data(eng_words) pctlettersineng <- sum(lettertokensunicodeescaped %in% eng_words)/length(lettertokensunicodeescaped) @@ 
-101,7 +136,7 @@ parse_text <- function(ret_path, keep_hyph_together=F, phrases_to_concatenate=NA spacyr::spacy_finalize() for(k in 1:length(custom_entities)){ - custom_entities[[k]] <- stringr::str_replace_all(custom_entities[[k]] ,"\\s",concatenator) + custom_entities[[k]] <- str_replace_all(custom_entities[[k]] ,"\\s",concatenator) all_parsed <- lapply(1:length(all_parsed), function (i){ all_parsed[[i]][all_parsed[[i]]$token %in% custom_entities[[k]] & all_parsed[[i]]$entity=="",]$entity <- paste0(names(custom_entities[k]), "_B") return(all_parsed[[i]]) @@ -111,4 +146,3 @@ parse_text <- function(ret_path, keep_hyph_together=F, phrases_to_concatenate=NA return(all_parsed) } - diff --git a/R/pdf_clean.R b/R/pdf_clean.R index ca187a7..057b58a 100644 --- a/R/pdf_clean.R +++ b/R/pdf_clean.R @@ -10,9 +10,7 @@ #' Entities that have no letters are removed, if remove_nums is set to T. #' #' @param pdfs a vector of file names -#' @param keep_pages A list of logical vectors, such that each vector in the list represents a pdf -#' and each element of the vector represents a page. All pages for which keep_pages == T will be included -#' in the exported file. keep_pages defaults to a single logical value: T, which keeps all pages. +#' @param keep_pages By default, NULL keeps all pages. Alternatively, the user can specify a numeric vector of pages to keep in the exported file. Duplicate values will be returned only once, and the keep_pages vector will be sorted before processing. #' @param ocr A logical value: T to run ocr image-to-text detection on pages with fewer than 20 characters; #' F to not run ocr on any pages. #' @param maxchar A numeric value representing the maximum allowable number of characters per page, @@ -31,13 +29,56 @@ #' number of files is equal to the number of pdfs. 
#' @importFrom pdftools pdf_text pdf_ocr_text
 #' @importFrom methods is
+#' @importFrom stringr str_split str_detect str_remove
 #' @export
 #'
-pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NULL, return_to_memory=T, suppressWarn = F, auto_headfoot_remove = T){
+pdf_clean <- function(pdfs, keep_pages=NULL, ocr=F, maxchar=10000, export_paths=NULL, return_to_memory=T, suppressWarn = F, auto_headfoot_remove = T){
+  # Input validation
+  if(!is.character(pdfs)) {
+    stop("'pdfs' must be a character vector of file names")
+  }
+
+  if(!is.null(keep_pages) && (!is.numeric(keep_pages) || any(keep_pages < 1) || any(keep_pages %% 1 != 0))) {
+    stop("'keep_pages' must be NULL or a vector of positive integers")
+  }
+
+  if(!is.logical(ocr) || length(ocr) != 1) {
+    stop("'ocr' must be a single logical value")
+  }
+
+  if(!is.numeric(maxchar) || length(maxchar) != 1 || maxchar <= 0) {
+    stop("'maxchar' must be a single positive numeric value")
+  }
+
+  if(!is.null(export_paths) && !is.character(export_paths)) {
+    stop("'export_paths' must be NULL or a character vector")
+  }
+
+  if(!is.logical(return_to_memory) || length(return_to_memory) != 1) {
+    stop("'return_to_memory' must be a single logical value")
+  }
+
+  if(!is.logical(suppressWarn) || length(suppressWarn) != 1) {
+    stop("'suppressWarn' must be a single logical value")
+  }
+
+  if(!is.logical(auto_headfoot_remove) || length(auto_headfoot_remove) != 1) {
+    stop("'auto_headfoot_remove' must be a single logical value")
+  }
+
   if(return_to_memory==F & is.null(export_paths)){
     stop("Either return_to_memory must be true or export_paths must be non-null.")
   }
   if(return_to_memory){
     all_pdfs <- vector(mode = "list", length = length(pdfs))
   }
@@ -47,11 +88,8 @@ pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NUL
     }else{
       texts <- pdf_text(pdfs[[k]])
     }
-
-    if(is.list(keep_pages)){
-      texts <- texts[keep_pages[[k]]]
-    } else if(!is.logical(keep_pages) || keep_pages == F){
-      stop("keep_pages must be either T (to keep all pages) or a list of logical vectors.")
+    if(!is.null(keep_pages)){
+      if(any(keep_pages > length(texts))){
+        stop("pages to keep exceeds the page length of the input document")
+      }
+      texts <- texts[sort(unique(keep_pages))]
     }
 
     if(ocr==T){
@@ -71,7 +109,7 @@ pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NUL
 
   if(auto_headfoot_remove == T){
     for(pagenum in 1:length(texts)){
-      linebreaks <- stringr::str_split(texts[pagenum],"\\n")[[1]]
+      linebreaks <- str_split(texts[pagenum],"\\n")[[1]]
 
       #HEADER
       if(length(linebreaks)>6){
@@ -81,14 +119,14 @@ pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NUL
         #which is row six in linebreaks
         #and delete it and all the rows above it
         linebreakhead <- linebreaks[1:6]
-        emptylines <- which(stringr::str_detect(linebreakhead,"^\\s*$"))
+        emptylines <- which(str_detect(linebreakhead,"^\\s*$"))
         if(length(emptylines)>=1){
           headercut <- emptylines[length(emptylines)]
           linebreaks <- linebreaks[(headercut+1):length(linebreaks)]
         }
         #if there are no empty lines, don't remove anything
       }else{
-        emptylines <- which(stringr::str_detect(linebreaks,"^\\s*$"))
+        emptylines <- which(str_detect(linebreaks,"^\\s*$"))
         if(length(emptylines)>=1){
           #just remove everything before the first set of two \\n,
           headercut <-
emptylines[1] @@ -114,7 +152,7 @@ pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NUL #and delete everything below it #as long as there are four or fewer lines of text after that cut line counter = 0 - emptylines <- which(stringr::str_detect(linebreaks,"^\\s*$")) + emptylines <- which(str_detect(linebreaks,"^\\s*$")) if(length(emptylines)>=1){ emptylinegroups <- sapply(c(1:length(emptylines)), @@ -123,7 +161,7 @@ pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NUL emptylinegroups <- emptylinegroups[!is.na(emptylinegroups)] footercut <- ifelse(length(emptylinegroups)<3,emptylinegroups[1], emptylinegroups[length(emptylinegroups)-2]) - lineswithtext <- which(stringr::str_detect(linebreaks,"^\\s*$",negate = T)) + lineswithtext <- which(str_detect(linebreaks,"^\\s*$",negate = T)) while(sum(lineswithtext > footercut)>4 & counter <=2){ counter = counter + 1 @@ -147,7 +185,7 @@ pdf_clean <- function(pdfs, keep_pages=T, ocr=F, maxchar=10000, export_paths=NUL #there are no empty lines. texts[pagenum] <- ifelse(is.na(linebreaks[1]), NA, paste(linebreaks, collapse = " ")) - texts[pagenum] <- stringr::str_remove(texts[pagenum],"\\s{3,}([0-9|x|v|i]{1,6}|([a-z]+\\p{Pd}[0-9]+))\\s*$") + texts[pagenum] <- str_remove(texts[pagenum],"\\s{3,}([0-9|x|v|i]{1,6}|([a-z]+\\p{Pd}[0-9]+))\\s*$") #remove page numbers: 5+ spaces followed by (a combo of 1-6 #roman and arabic numerals) or (a letter, hyphen, and #set of numbers such as c-28) at the end of a page diff --git a/R/textnet_extract.R b/R/textnet_extract.R index fe26b2d..89af842 100644 --- a/R/textnet_extract.R +++ b/R/textnet_extract.R @@ -54,6 +54,10 @@ #' #' @import data.table #' @importFrom magrittr %>% +#' @importFrom dplyr group_by filter +#' @importFrom tidyr expand +#' @importFrom pbapply pblapply +#' @importFrom utils data #' @export #' @@ -62,6 +66,39 @@ textnet_extract <- function (x, concatenator = "_",file = NULL,cl = 1, return_to_memory = T, keep_incomplete_edges=F, remove_neg = T) { + # Input validation + if(!is.data.frame(x) && !is.data.table(x)) { + stop("'x' must be a data.frame or data.table") + } + + if(!is.character(concatenator) || length(concatenator) != 1) { + stop("'concatenator' must be a single character string") + } + + if(!is.null(file) && !is.character(file)) { + stop("'file' must be NULL or a character string") + } + + if(!is.numeric(cl) || cl < 1 || cl%%1 != 0) { + stop("'cl' must be a positive integer") + } + + if(!is.character(keep_entities)) { + stop("'keep_entities' must be a character vector") + } + + if(!is.logical(return_to_memory) || length(return_to_memory) != 1) { + stop("'return_to_memory' must be a single logical value") + } + + if(!is.logical(keep_incomplete_edges) || length(keep_incomplete_edges) != 1) { + stop("'keep_incomplete_edges' must be a single logical value") + } + + if(!is.logical(remove_neg) || length(remove_neg) != 1) { + stop("'remove_neg' must be a single logical value") + } + ### note this should be an error if(is.null(file) && return_to_memory == F){stop("function not set to save output OR return object to memory")} x <- data.table::as.data.table(x) @@ -279,6 +316,3 @@ textnet_extract <- function (x, concatenator = "_",file = NULL,cl = 1, if(return_to_memory){return(list('nodelist' = nodelist,'edgelist' = edgelist,'verblist'=verblist,'appositivelist'=apposlist))} } - - - diff --git a/R/top_features.R b/R/top_features.R index c384c28..471fdf8 100644 --- a/R/top_features.R +++ b/R/top_features.R @@ -8,10 +8,24 @@ #' #' @return list of all 
entities and lemmas in the corpus, along with their average normalized prevalence as a fraction of a plan. For entities, this is the entity degree over the sum of all entity degrees in the plan, averaged across all plans
 #' @importFrom magrittr %>%
+#' @importFrom network network
+#' @importFrom ohenery normalize
+#' @importFrom tidyr tibble
+#' @import dplyr
+#' @import igraph
 #'
 #' @export
 top_features <- function(files, from_file=F){
+  # Input validation
+  if(!is.list(files) && !is.character(files)) {
+    stop("'files' must be either a list of igraph objects or a character vector of file paths")
+  }
+
+  if(!is.logical(from_file) || length(from_file) != 1) {
+    stop("'from_file' must be a single logical value")
+  }
+
   all_lemmas<- vector("list", length = length(files))
 
   all_entities <- vector("list", length = length(files))
@@ -57,4 +71,3 @@ top_features <- function(files, from_file=F){
   return(list(entities = all_entity_percents, lemmas = all_lemma_percents))
 }
 
-
diff --git a/R/utils.R b/R/utils.R
index 9f4fb1d..2e83f90 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -9,6 +9,11 @@
 #' @return list with named empty vectors
 create_empty_sentence_parse_list <- function(N){
+  # Input validation
+  if(!is.numeric(N) || length(N) != 1 || N < 0 || N %% 1 != 0) {
+    stop("'N' must be a single non-negative integer")
+  }
+
   raw_empty = vector(mode = "character", N)
   parse_list = list(source_or_target = raw_empty,
                     head_verb_id = raw_empty,
diff --git a/R/verbnet_port.R b/R/verbnet_port.R
index 1348b23..0ec1a6a 100644
--- a/R/verbnet_port.R
+++ b/R/verbnet_port.R
@@ -9,6 +9,11 @@
 #' @param folder_dest A filepath for the folder in which to put the extracted VerbNet files
 #'
 #' @import data.table
+#' @importFrom utils download.file untar
+#' @importFrom R.utils gunzip
+#' @importFrom xml2 read_xml xml_attr xml_find_all
+#' @importFrom dplyr case_when
 #' @return Returns the data.table of verbs and their classifications.
 #'
 #'
@@ -18,6 +23,15 @@
 #and folder_dest was set to "data"
 verbnet_port <- function(zipdestfile, folder_dest){
+  # Input validation
+  if(!is.character(zipdestfile) || length(zipdestfile) != 1) {
+    stop("'zipdestfile' must be a single character string")
+  }
+
+  if(!is.character(folder_dest) || length(folder_dest) != 1) {
+    stop("'folder_dest' must be a single character string")
+  }
+
   url <- "https://verbs.colorado.edu/verb-index/vn/verbnet-3.3.tar.gz"
   utils::download.file(url, paste0(zipdestfile,".tar.gz"), method="curl")
   R.utils::gunzip(paste0(zipdestfile,".tar.gz"))
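
An illustrative sketch of the revised interfaces, assuming the argument validation added above; the PDF path and entity string below are hypothetical placeholders, and the exact output of clean_entities() depends on the rest of its cleaning steps:

library(textNet)

# keep_pages is now NULL (keep every page) or a numeric vector of page numbers;
# "my_plan.pdf" is a placeholder path for a multi-page PDF
pages <- pdf_clean("my_plan.pdf", keep_pages = c(1, 2), ocr = FALSE)

# non-logical flags now fail fast with an informative error
clean_entities(c("Tahoe_Basin's"), remove_nums = "yes")
# Error: 'remove_nums' must be a boolean value (TRUE/FALSE)

# valid input is cleaned as before (trailing "'s" stripped when remove_trailing_s = TRUE)
clean_entities(c("Tahoe_Basin's"))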