Commit
Merge pull request #9 from ucd-cepb/tyler-edits
Tyler edits
tylerandrewscott authored Nov 7, 2024
2 parents 66d3951 + eae3329 commit 7297895
Showing 15 changed files with 368 additions and 82 deletions.
50 changes: 34 additions & 16 deletions R/clean_entities.R
@@ -15,55 +15,73 @@
#' @param concatenator The concatenator used in elements of v. Defaults to an underscore; use regex notation.
#' @return a cleaned vector of entity names

#' @importFrom stringr str_remove_all
#' @importFrom stringi stri_replace_all_regex
#' @export
#'

clean_entities <- function(v, remove_nums=T, remove_trailing_s=T, concatenator = "_"){
# Input validation
if (!is.vector(v)) {
stop("'v' must be a vector")
}

if (!is.logical(remove_nums)) {
stop("'remove_nums' must be a boolean value (TRUE/FALSE)")
}

if (!is.logical(remove_trailing_s)) {
stop("'remove_trailing_s' must be a boolean value (TRUE/FALSE)")
}

if (!is.character(concatenator) || length(concatenator) != 1) {
stop("'concatenator' must be a single character string")
}

#format math font as regular font
#unicode of math font alphabet
maths <- c("\\U0001d44e", "\\U0001d44f", "\\U0001d450", "\\U0001d451", "\\U0001d452",
"\\U0001d453", "\\U0001d454", "\\u210e", "\\U0001d456", "\\U0001d457",
"\\U0001d458", "\\U0001d459", "\\U0001d45a", "\\U0001d45b", "\\U0001d45c",
"\\U0001d45d", "\\U0001d45e", "\\U0001d45f", "\\U0001d460", "\\U0001d461",
"\\U0001d462", "\\U0001d463", "\\U0001d464", "\\U0001d465", "\\U0001d466",
"\\U0001d467", "\\U0001d434", "\\U0001d435", "\\U0001d436", "\\U0001d437",
"\\U0001d438", "\\U0001d439", "\\U0001d43a", "\\U0001d43b", "\\U0001d43c",
"\\U0001d43d", "\\U0001d43e", "\\U0001d43f", "\\U0001d440", "\\U0001d441",
"\\U0001d442", "\\U0001d443", "\\U0001d444", "\\U0001d445", "\\U0001d446",
"\\U0001d447", "\\U0001d448", "\\U0001d449", "\\U0001d44a", "\\U0001d44b",
"\\U0001d44c", "\\U0001d44d")
"\\U0001d453", "\\U0001d454", "\\u210e", "\\U0001d456", "\\U0001d457",
"\\U0001d458", "\\U0001d459", "\\U0001d45a", "\\U0001d45b", "\\U0001d45c",
"\\U0001d45d", "\\U0001d45e", "\\U0001d45f", "\\U0001d460", "\\U0001d461",
"\\U0001d462", "\\U0001d463", "\\U0001d464", "\\U0001d465", "\\U0001d466",
"\\U0001d467", "\\U0001d434", "\\U0001d435", "\\U0001d436", "\\U0001d437",
"\\U0001d438", "\\U0001d439", "\\U0001d43a", "\\U0001d43b", "\\U0001d43c",
"\\U0001d43d", "\\U0001d43e", "\\U0001d43f", "\\U0001d440", "\\U0001d441",
"\\U0001d442", "\\U0001d443", "\\U0001d444", "\\U0001d445", "\\U0001d446",
"\\U0001d447", "\\U0001d448", "\\U0001d449", "\\U0001d44a", "\\U0001d44b",
"\\U0001d44c", "\\U0001d44d")
letts <- c(letters,LETTERS)

v <- pbapply::pblapply(1:length(v), function(i){
stringi::stri_replace_all_regex(v[i], pattern = maths,
v <- lapply(1:length(v), function(i){
stri_replace_all_regex(v[i], pattern = maths,
replacement = letts,
vectorize= F)
})

if(remove_trailing_s==T){
#remove strings with specific placement: trailing "'s"
index <- which(grepl("'s$",v,perl = T))
v[index] <- stringr::str_remove_all(v[index],"'s$")
v[index] <- str_remove_all(v[index],"'s$")

#remove strings with specific placement: trailing concatenator + s, e.g. "_s"
index <- which(grepl(paste0(concatenator,"s$"),v,perl = T))
v[index] <- stringr::str_remove_all(v[index],paste0(concatenator,"s$"))
v[index] <- str_remove_all(v[index],paste0(concatenator,"s$"))

}

#next, remove all non-word characters
remove <- c("\\W")
index <- which(grepl(paste(remove,collapse = '|'),v,perl = T))
v[index] <- stringr::str_remove_all(v[index],paste(remove,collapse = '|'))
v[index] <- str_remove_all(v[index],paste(remove,collapse = '|'))

#remove consecutive underscores that may have arisen due to previous cleaning step
v <- gsub(paste0('(',concatenator,')\\1+'), '\\1', v)

#remove leading or trailing underscores that may have arisen due to previous cleaning steps
remove <- c(paste0("^",concatenator), paste0(concatenator, "$"))
index <- which(grepl(paste(remove,collapse = '|'),v,perl = T))
v[index] <- stringr::str_remove_all(v[index],paste(remove,collapse = '|'))
v[index] <- str_remove_all(v[index],paste(remove,collapse = '|'))

#remove entities that have no letters (or numbers, if remove_nums == F)
if(remove_nums){
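For orientation, a minimal usage sketch of the revised clean_entities(); the entity strings below are hypothetical:

# hypothetical entity names from an upstream extraction step
ents <- c("Seattle_Supersonics's", "water_board_s", "EPA!!", "42")
clean_entities(ents, remove_nums = TRUE, remove_trailing_s = TRUE, concatenator = "_")
# expected result: trailing possessives and "_s" endings dropped, non-word
# characters stripped, and the all-numeric entity removed because remove_nums = TRUE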
46 changes: 38 additions & 8 deletions R/combine_networks.R
@@ -7,24 +7,54 @@
#' @param mode Either "multiplex" or "weighted" depending on the desired exported network format. If "weighted", collapses multiplex edges into a single weighted edge
#'
#' @return Single igraph object that consolidates nodes and edges from input graphs. If there are multiple nodes with the same name and different attributes originating from different graphs, this function preserves the node attributes associated with the version that appears most commonly. Adds a node attribute num_graphs_in, which denotes the number of input graphs each node was found in. For a weighted graph, the weight is equal to the original number of edges between the respective source and target nodes. Edge attributes for a multiplex graph are described in the help file of textnet_extract.
#' @import data.table
#' @importFrom data.table rbindlist
#' @import igraph
#' @importFrom methods is
#' @export
#'

combine_networks <- function(textnet_igraphs, mode){
combine_networks <- function(textnet_igraphs, mode = c('multiplex','weighted')){
# Check if textnet_igraphs is missing
if(missing(textnet_igraphs)) {
stop("Argument 'textnet_igraphs' is missing. Must provide a list of igraph objects.")
}

# Check if textnet_igraphs is a list
if(!is.list(textnet_igraphs)){
stop("please format your graphs as elements of a list.")
stop("Argument 'textnet_igraphs' must be a list of igraph objects.")
}
num_graphs <- length(textnet_igraphs)
if(sum(sapply(1:length(textnet_igraphs), function(i) !is(textnet_igraphs[[i]], "igraph")))>0){
stop("combine_networks only accepts igraph objects.")

# Check if list is empty
if(length(textnet_igraphs) == 0) {
stop("Argument 'textnet_igraphs' cannot be an empty list.")
}
if(!mode %in% c("multiplex","weighted")){
stop("Mode must be either multiplex or weighted.")

# Check if all elements are igraph objects
if(sum(sapply(textnet_igraphs, function(x) !is(x, "igraph"))) > 0){
stop("All elements in 'textnet_igraphs' must be igraph objects.")
}

# Check if mode is missing
if(missing(mode)) {
stop("Argument 'mode' is missing. Must be either 'multiplex' or 'weighted'.")
}

# Check if mode is character
if(!is.character(mode)) {
stop("Argument 'mode' must be a character string ('multiplex' or 'weighted').")
}

# Check if mode is length 1
if(length(mode) != 1) {
stop("Argument 'mode' must be a single value ('multiplex' or 'weighted').")
}

# Check if mode has valid value
if(!mode %in% c("multiplex","weighted")){
stop("Argument 'mode' must be either 'multiplex' or 'weighted'.")
}

num_graphs <- length(textnet_igraphs)
supernodes <- vector(mode = "list", length = length(num_graphs))
superedges <- vector(mode = "list", length = length(num_graphs))
for(m in 1:num_graphs){
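A hedged usage sketch of combine_networks() with its tightened argument checks; g1 and g2 stand in for igraph objects previously built from textNet extracts:

# g1 and g2 are hypothetical igraph objects from separate documents
combined_multi <- combine_networks(list(g1, g2), mode = "multiplex")
combined_wtd   <- combine_networks(list(g1, g2), mode = "weighted")
# per the documentation above, the weighted form collapses parallel edges into
# one weighted edge, and each node carries a num_graphs_in attribute
igraph::vertex_attr(combined_wtd, "num_graphs_in")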
20 changes: 18 additions & 2 deletions R/crawl_sentence.R
@@ -5,10 +5,26 @@
#'
#' @param s a data.frame containing the results of one (1) parsed spacy sentence
#' @return list with original parsed sentence + added dependency parsing
#'
#' @importFrom dplyr case_when
#'

crawl_sentence <- function(s){
# Input validation
if (!is.data.frame(s)) {
stop("'s' must be a data.frame")
}

if (nrow(s) == 0) {
stop("'s' must contain at least one row")
}

# Required columns
required_cols <- c("pos", "dep_rel", "head_token_id", "lemma", "token", "token_id", "tag")
missing_cols <- required_cols[!required_cols %in% names(s)]
if (length(missing_cols) > 0) {
stop(paste0("Missing required columns in 's': ", paste(missing_cols, collapse=", ")))
}

sentence <- s
empty_list <- create_empty_sentence_parse_list(nrow(sentence))
#tag auxes acting as auxes (eg "will" in "will attempt") for removal
@@ -51,7 +67,7 @@ empty_list$xcomp_verb <- ifelse(sentence$pos=="VERB" & sentence$dep_rel =="xcomp
#this categorizes each word as either source or target and
#saves it as a new column. Identifies head_verb_id and saves it as a new column
while(is.na(empty_list$source_or_target[tok_num]) & break_while_counter < 15){
empty_list$source_or_target[tok_num] <- dplyr::case_when(
empty_list$source_or_target[tok_num] <- case_when(
#if head_token_id trail traces back to an appositive before hitting anything else --> NA
#if you find an appositive, stop -- it's a duplicate and should not be counted
sentence[current_token_id,dep_rel] %in% c("appos") ~ "appos",
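crawl_sentence() appears to be an internal helper, so the sketch below is illustrative only; it assumes 'parsed' comes from spacyr::spacy_parse(txt, dependency = TRUE, tag = TRUE), which supplies the columns required by the new validation block:

# one sentence of a hypothetical spacyr parse, passed as a data.frame
one_sentence <- as.data.frame(parsed[parsed$doc_id == "text1" & parsed$sentence_id == 1, ])
crawled <- textNet:::crawl_sentence(one_sentence)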
28 changes: 28 additions & 0 deletions R/disambiguate.R
@@ -29,13 +29,41 @@
#' @return a cleaned textnet extract. See textnet_extract help file for structure.
#'
#' @import data.table
#' @importFrom stringr str_detect str_remove_all str_replace_all
#' @importFrom dplyr arrange desc filter
#' @importFrom magrittr %>%
#' @importFrom methods is
#' @export
#'

#if recursive is true, runs it multiple times to reach the end of the chain.

disambiguate <- function(textnet_extract, from, to, match_partial_entity=rep(F, length(from)), try_drop=NULL, recursive=T, concatenator="_"){
# Input validation
if(!is.list(from)) {
stop("'from' must be a list")
}

if(!is.list(to)) {
stop("'to' must be a list")
}

if(!is.logical(match_partial_entity)) {
stop("'match_partial_entity' must be a logical vector")
}

if(!is.null(try_drop) && !is.character(try_drop)) {
stop("'try_drop' must be NULL or a character vector")
}

if(!is.logical(recursive)) {
stop("'recursive' must be a logical value")
}

if(!is.character(concatenator) || length(concatenator) != 1) {
stop("'concatenator' must be a single character string")
}

options(warn=1)
#Data formatting checks####
multi_to <- sapply(1:length(to), function(w) length(to[[w]]) > 1)
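A sketch of a disambiguate() call that satisfies the new checks; the extract and the alias pairs are hypothetical:

# 'extract' stands in for a textnet_extract() result; map two alias spellings
# onto one canonical entity name
cleaned <- disambiguate(
  textnet_extract = extract,
  from = list("EPA", "Environmental_Protection_Agency"),
  to   = list("US_EPA", "US_EPA"),
  recursive = TRUE,
  concatenator = "_"
)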
5 changes: 3 additions & 2 deletions R/download_dictionary.R
@@ -3,7 +3,9 @@

#' Downloads the SCOWL 2020.12.07 dictionary created by Kevin Atkinson, which has a larger vocabulary than Qdap
#' Code adapted from Mirek Długosz 2016 https://mirekdlugosz.com/blog/2016/how-to-use-r-to-recognize-if-given-string-is-a-word/

#' @importFrom utils download.file unzip
#' @importFrom tools file_ext
#' @importFrom stringi stri_read_lines stri_escape_unicode

download_dictionary <- function(){

@@ -33,4 +35,3 @@ download_dictionary <- function(){
save(eng_words, file = "data/eng_words.rda")

}
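download_dictionary() takes no arguments; a developer-side sketch, assuming it remains unexported and writes data/eng_words.rda into the package source tree:

# regenerate the bundled English word list from the SCOWL 2020.12.07 dictionary
textNet:::download_dictionary()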

23 changes: 18 additions & 5 deletions R/entity_consolidate_replicate.R
@@ -6,21 +6,34 @@
#' Instead, this function basically breaks off a feature of the original function and adds a new column to the original spacyr data.frame that is the concatenated entity.
#' This feature is inefficient in that the concatenated entity is then replicated multiple times, but this does seem to be the easiest way to preserve the other data.
#'
#' @param x parsed spacy document
#' @param x parsed spacy document in data.frame format
#' @param concatenator A character that separates string segments when they are collapsed into a single entity. Defaults to "_"
#' @param remove regex formatted strings to remove as entity components (like "the" in "the Seattle Supersonics")
#' @return original data frame with added column for concatenated entity
#'
#' @import data.table
#' @importFrom stringr str_remove_all
#'

entity_consolidate_replicate <- function(x, concatenator = "_",remove = NULL) {
#Remove tokens that have no alphabet characters
entity_consolidate_replicate <- function(x, concatenator = "_", remove = NULL) {
# Input validation
if (!is.data.frame(x)) {
stop("'x' must be a data.frame")
}

if (!is.character(concatenator) || length(concatenator) != 1) {
stop("'concatenator' must be a single character string")
}

spacy_result <- data.table::as.data.table(x)
if (!is.null(remove) && !is.character(remove)) {
stop("'remove' must be NULL or a character vector")
}

#Remove tokens that have no alphabet characters
spacy_result <- as.data.table(x)
if(!is.null(remove)){
index <- which(grepl(paste(remove,collapse = '|'),spacy_result$token,perl = T)&spacy_result$entity!="")
spacy_result$token[index] <- stringr::str_remove_all(spacy_result$token[index],paste(remove,collapse = '|'))
spacy_result$token[index] <- str_remove_all(spacy_result$token[index],paste(remove,collapse = '|'))
spacy_result$entity[index] <- ""
}

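A hedged sketch of entity_consolidate_replicate(); 'parsed' is again a hypothetical spacyr parse, the function is assumed to be unexported (hence the ::: call), and the remove pattern is an illustrative choice:

# drop standalone "the"/"The" tokens from entity spans before the remaining
# tokens are concatenated with "_"
parsed_ents <- textNet:::entity_consolidate_replicate(parsed, concatenator = "_",
                                                      remove = "^[Tt]he$")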
30 changes: 25 additions & 5 deletions R/export_to_network.R
@@ -31,20 +31,41 @@
#' \item num_communities -- number of communities using louvain cluster algorithm on a weighted, undirected, non-multiplex version of the network
#' \item percent_vbn, percent_vbg, percent_vpb, percent_vbd, percent_vb, percent_vbz -- percent of edges in the graph that are of the respective verb tense
#' }
#' @import igraph
#' @importFrom dplyr filter
#' @importFrom network network network.size network.edgecount network.density
#' @importFrom sna connectedness centralization gtrans
#' @importFrom stats median
#' @importFrom base unique
#' @export
#'

export_to_network <- function(textnet_extract, export_format, keep_isolates=T, collapse_edges, self_loops){
# Input validation
if(!is.list(textnet_extract)) {
stop("'textnet_extract' must be a list")
}

if(!is.character(export_format) || length(export_format) != 1) {
stop("'export_format' must be a single character string")
}

if(!export_format %in% c("igraph","network")){
stop("export_format must be either 'igraph' or 'network'")
}
if(!keep_isolates %in% c(T,F)){
stop("keep_isolates must be either T or F.")

if(!is.logical(keep_isolates) || length(keep_isolates) != 1) {
stop("'keep_isolates' must be a single logical value")
}
if(!collapse_edges %in% c(T,F)){
stop("collapse_edges must be either T or F.")

if(!is.logical(collapse_edges) || length(collapse_edges) != 1) {
stop("'collapse_edges' must be a single logical value")
}

if(!is.logical(self_loops) || length(self_loops) != 1) {
stop("'self_loops' must be a single logical value")
}

textnet_extract$edgelist <- dplyr::filter(textnet_extract$edgelist, !is.na(textnet_extract$edgelist$source) & !is.na(textnet_extract$edgelist$target))


@@ -186,4 +207,3 @@ export_to_network <- function(textnet_extract, export_format, keep_isolates=T, c

}
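A usage sketch of export_to_network() under the stricter validation; 'extract' is a hypothetical textnet_extract() result:

# igraph export, keeping isolates, leaving multiplex edges uncollapsed,
# and retaining self-loops
net <- export_to_network(extract, export_format = "igraph",
                         keep_isolates = TRUE, collapse_edges = FALSE,
                         self_loops = TRUE)
# per the documentation above, the export also reports graph-level statistics
# such as num_communities and the percent_vb* verb-tense shares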


27 changes: 21 additions & 6 deletions R/filter_sentences.R
@@ -15,20 +15,35 @@
#' @param case_sensitive Whether the token is required to have the specific casing used in the dictionary to count as a match. Defaults to F.
#' @return a cleaned version of 'file', keeping only the sentences that pass the threshold requirement.
#' @importFrom stats aggregate
#' @importFrom dplyr left_join

#' @export
#'

filter_sentences <- function(file, filter = textNet::eng_words,
percent_threshold = 40, case_sensitive = F){
#TODO class type checks
if(case_sensitive == F){
# Input validation
if(!is.data.frame(file)) {
stop("'file' must be a data frame")
}

if(!is.character(filter)) {
stop("'filter' must be a character vector")
}

if(!is.numeric(percent_threshold) || percent_threshold < 0 || percent_threshold > 100) {
stop("'percent_threshold' must be a numeric value between 0 and 100")
}

if(!is.logical(case_sensitive)) {
stop("'case_sensitive' must be a logical value (TRUE/FALSE)")
}

if(!case_sensitive){
filter <- tolower(filter)
tokens <- tolower(file$token)
}else if(case_sensitive ==T){
}else if(case_sensitive){
tokens <- file$token
}else{
stop("case_sensitive must be T or F.")
}
eng <- tokens %in% filter
percent_pass_filter <- aggregate(eng,
@@ -39,6 +54,6 @@ filter_sentences <- function(file, filter = textNet::eng_words,
percent_pass_filter$x <- NULL
colnames(percent_pass_filter) <- c("sentence_id", "doc_id")

file <- dplyr::left_join(percent_pass_filter, file)
file <- left_join(percent_pass_filter, file)
return(file)
}
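Finally, a sketch of filter_sentences() with its new argument checks; 'parsed' is a hypothetical parsed document in data.frame form:

# keep only sentences in which at least 40% of tokens match the bundled
# English word list
filtered <- filter_sentences(parsed, filter = textNet::eng_words,
                             percent_threshold = 40, case_sensitive = FALSE)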
