Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
pendy05 committed Sep 2, 2024
2 parents c7c8da2 + 838c10c commit 76b611c
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 59 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(plot_dynamics_proteome)
export(plot_entropy)
export(plot_time)
export(plot_worldmap)
importFrom(dplyr,"%>%")
importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
importFrom(dplyr,distinct)
Expand All @@ -23,6 +24,7 @@ importFrom(dplyr,right_join)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,summarise)
importFrom(dplyr,ungroup)
importFrom(dplyr,vars)
importFrom(gghalves,geom_half_boxplot)
importFrom(gghalves,geom_half_point)
Expand Down
175 changes: 119 additions & 56 deletions R/metadata_extraction.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
#' Metadata Extraction from NCBI/GISAID EpiCoV FASTA file
#'
#' This function retrieves metadata (ID, region, date) from the input FASTA file, with the source of, either
#' NCBI (with default FASTA header) or GISAID (with default FASTA header).
#' NCBI (with default FASTA header) or GISAID (with default FASTA header). The function will return a dataframe
#' that has three columns consisting ID, collected region and collected date. Records that do not have region or date
#' information will be excluded from the output dataframe.
#'
#' @param file_path path of fasta file
#' @param source the source of fasta file, either "ncbi" or "GISAID"
#' @param source the source of fasta file, either "NCBI" or "GISAID"
#' @return A dataframe that has three columns consisting ID, collected region and collected date
#' @examples filepath <- system.file('extdata','GISAID_EpiCoV.faa', package = 'vDiveR')
#' @examples meta_gisaid <- metadata_extraction(filepath, 'GISAID')
#' @export
metadata_extraction <- function(file_path, source){
if(source == 'ncbi'){
if (tolower(source) == 'ncbi') {
meta <- extract_from_NCBI(file_path)
}
if(source == 'GISAID'){
} else if (tolower(source) == 'gisaid') {
meta <- extract_from_GISAID(file_path)
} else {
stop("Invalid source specified. Please use 'NCBI' or 'GISAID'.")
}
return(meta)
}
Expand All @@ -25,22 +28,20 @@ metadata_extraction <- function(file_path, source){
#' @param file_path path of fasta file
#' @importFrom stringr str_extract
extract_from_GISAID <- function(file_path){
heads <- c()
lines <- readLines(file_path, warn=FALSE)
for(line in lines){
if(grepl('>', line)){
line <- substr(line, 2, nchar(line))
heads <- c(heads, line)
}
}
IDs <- c(); regions <- c(); dates <- c()
for(head in heads){
ID <- str_extract(head, "EPI[^|]*"); IDs <- c(IDs, ID)
region <- tryCatch(strsplit(head, '/', fixed=T)[[1]][2], error = function(e) "NA")
regions <- c(regions, region)
date <- str_extract(head, "[0-9]{4}-[0-9]{2}-[0-9]{2}"); dates <- c(dates, date)
}
return(data.frame('ID' = IDs, 'region' = regions, 'date' = dates))
lines <- readLines(file_path, warn = FALSE)
headers <- lines[grepl('^>', lines)]

#================= Extract metadata from headers =====================#
#e.g: >Spike|hCoV-19/Wuhan/WIV04/2019|2019-12-30|EPI_ISL_402124|
# id: EPI_ISL_402124
# region: Wuhan
# date: 2019-12-30

ids <- str_extract(headers, "EPI[^|]*")
regions <- sapply(strsplit(headers, "/"), function(x) if(length(x) > 1) x[2] else NA)
dates <- str_extract(headers, "\\d{4}-\\d{2}-\\d{2}")

return(data.frame(ID = ids, region = regions, date = dates, stringsAsFactors = FALSE))
}

#' Extract metadata via fasta file from ncbi
Expand All @@ -49,47 +50,109 @@ extract_from_GISAID <- function(file_path){
#' @param file_path path of fasta file
#' @importFrom rentrez entrez_search entrez_fetch
extract_from_NCBI <- function(file_path){
IDs <- c()
lines <- readLines(file_path, warn=FALSE)
for(line in lines){
if(grepl('>', line)){
ID <- strsplit(line, ' ')[[1]][1]
ID <- substr(ID,2,nchar(ID))
IDs <- c(IDs, ID)
}
}
regions <- c(); dates <- c(); dropsample <- c(); keepsample <- c()
#================= Extract metadata from headers =====================#
#e.g: >QQL10871.1 Rh230 [Rhesus cytomegalovirus strain 68-1_FL]
DATE_FORMATS <- c('%Y-%m-%d', '%d-%B-%Y', '%m/%d/%Y', '%B %d, %Y', '%d-%m-%Y')

lines <- readLines(file_path, warn=FALSE) # read lines from fasta file into vector
# extract id from header
headers <- lines[grepl('^>', lines)]
headers <- substring(headers, 2)
IDs <- sapply(strsplit(headers, ' '), function(x) x[1])


regions <- character()
dates <- character()
drop_sample_not_found <- character()
drop_sample_no_date_or_region <- character()
drop_sample_date_format_issue <- character()
keep_sample <- character()

# regions <- c(); dates <- c(); drop_sample <- c(); keep_sample <- c()
for(ID in IDs){
if(substr(ID,1,3) == 'pdb'){
dropsample <- c(dropsample, ID)
if(startsWith(ID, 'pdb')){
drop_sample_not_found <- c(drop_sample_not_found, ID)
next
}
keepsample <- c(keepsample, ID)
search_result <- entrez_search(db = "protein", term = ID, retmax = 1)
accession <- search_result$ids[[1]]
info <- entrez_fetch(db = "protein", id = accession, rettype = "gb", retmode = "text")
info <- tryCatch(strsplit(info, '\n')[[1]], error = function(e) "")
idx1 <- grep('/region=', info); idx2 <- grep('/collection_date=', info)
info1 <- info[idx1]; info2 <- info[idx2]
region <- tryCatch(strsplit(info1, '\\"')[[1]][2], error = function(e) "NA")

# keep_sample <- c(keep_sample, ID)
search_result <- rentrez::entrez_search(db = "protein", term = ID, retmax = 1)
if(search_result$count == 0){
drop_sample_not_found <- c(drop_sample_not_found, ID)
next
}

uid <- search_result$ids[1] #get unique identifier
# fetch genbank entry
genbank_entry <- rentrez::entrez_fetch(db = "protein", id = uid, rettype = "gb", retmode = "text")
entry_info <- tryCatch(strsplit(genbank_entry, '\n')[[1]], error = function(e) "")

# get region and date index
region_index <- grep('/country=', entry_info)
date_index <- grep('/collection_date=', entry_info)

if (length(region_index) == 0){region_index <- grep('/geo_loc_name=', entry_info)}
if(length(region_index) == 0 | length(date_index) == 0){
drop_sample_no_date_or_region <- c(drop_sample_no_date_or_region, ID)
next
}

# extract region and date from the identified line
region_info <- entry_info[region_index]
date_info <- entry_info[date_index]

region <- tryCatch(strsplit(region_info, '\\"')[[1]][2], error = function(e) "NA")
if(grepl(':', region)){region <- strsplit(region,':')[[1]][1]}
date <- tryCatch(strsplit(info2, '\\"')[[1]][2], error = function(e) "NA")
regions <- c(regions, region); dates <- c(dates, date)
date <- tryCatch(strsplit(date_info, '\\"')[[1]][2], error = function(e) "NA")

regions <- c(regions, region)
dates <- c(dates, date)
keep_sample <- c(keep_sample, ID)
}
tmp <- data.frame('ID' = keepsample, 'region' = regions, 'date' = dates)

for(i in 1:nrow(tmp)){
if(!is.na(as.Date(tmp$date[i],format='%Y-%m-%d'))){
tt <- 1
} else if(!is.na(as.Date(tmp$date[i],format='%d-%B-%Y'))){
tmp$date[i] <- as.character(as.Date(tmp$date[i],format="%d-%B-%Y"))
} else {
dropsample <- c(dropsample, tmp$ID[i])

if (length(regions) == 0 | length(dates) == 0) {
stop("No valid records found in the input file.")
}

if (length(drop_sample_not_found) > 0) {
drop_sample_not_found <- unique(drop_sample_not_found)
warning_info <- paste(c('\nExcluded records:\n',drop_sample_not_found, '\nID not found in the database.'), collapse = " ")
warning(warning_info)
}
if (length(drop_sample_no_date_or_region) > 0) {
drop_sample_no_date_or_region <- unique(drop_sample_no_date_or_region)
warning_info <- paste(c('\nExcluded records:\n',drop_sample_no_date_or_region, '\nDate and/or region information is not available.'), collapse = " ")
warning(warning_info)
}

result_df <- data.frame('ID' = keep_sample, 'region' = regions, 'date' = dates, stringsAsFactors = FALSE)
result_df$date <- sapply(result_df$date, function(d) { # format and validate dates
final_date <- NA

for(fmt in DATE_FORMATS) { # attempt to parse the date with each format
parsed_date <- as.Date(d, format = fmt)
if (!is.na(parsed_date)) { # if successful, convert to the standard format '%Y-%m-%d'
final_date <- as.character(parsed_date)
break
}
}

if (is.na(final_date)) { # if no format matched, add to drop_sample_date_format_issue
drop_sample_date_format_issue <- c(drop_sample_date_format_issue, result_df$ID[result_df$date == d])
}

return(final_date)
})

if (length(drop_sample_date_format_issue) > 0) {
drop_sample_date_format_issue <- unique(drop_sample_date_format_issue)
warning_info <- paste(c('\nExcluded records:\n',drop_sample_date_format_issue, '\nDate in the entry is not in these formats: .', DATE_FORMATS, '\n.'), collapse = " ")
warning(warning_info)
}
warninfo <- paste(c('\nExcluded records:\n',dropsample, '\nregion/date (%Y-%m-%d OR %d-%B-%Y format) are not provided.'), collapse = " ")
warning(warninfo)
tmp <- tmp[! tmp$ID %in% dropsample, ]
return(tmp)

all_drops <- c(drop_sample_not_found, drop_sample_no_date_or_region, drop_sample_date_format_issue)
result_df <- result_df[! result_df$ID %in% all_drops, ] # filter the result_df to exclude IDs in all_drops

return(result_df)
}

2 changes: 1 addition & 1 deletion R/plot_worldmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#' @examples geographical_plot <- plot_worldmap(metadata)$plot
#' @examples geographical_df <- plot_worldmap(metadata)$df
#' @importFrom ggplot2 geom_polygon scale_fill_gradient map_data
#' @importFrom dplyr left_join %>% groupby ungroup slice
#' @importFrom dplyr left_join %>% group_by ungroup slice
#' @importFrom stringr str_to_title
#' @export
plot_worldmap <- function(meta, base_size=8){
Expand Down
6 changes: 4 additions & 2 deletions man/metadata_extraction.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 76b611c

Please sign in to comment.