Merge branch 'dev'

pendy05 · Sep 2, 2024 · 76b611c · 76b611c
2 parents c7c8da2 + 838c10c
commit 76b611c
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 59 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(plot_dynamics_proteome)
 export(plot_entropy)
 export(plot_time)
 export(plot_worldmap)
+importFrom(dplyr,"%>%")
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
 importFrom(dplyr,distinct)
@@ -23,6 +24,7 @@ importFrom(dplyr,right_join)
 importFrom(dplyr,select)
 importFrom(dplyr,slice)
 importFrom(dplyr,summarise)
+importFrom(dplyr,ungroup)
 importFrom(dplyr,vars)
 importFrom(gghalves,geom_half_boxplot)
 importFrom(gghalves,geom_half_point)

diff --git a/R/metadata_extraction.R b/R/metadata_extraction.R
@@ -1,20 +1,23 @@
 #' Metadata Extraction from NCBI/GISAID EpiCoV FASTA file
 #'
 #' This function retrieves metadata (ID, region, date) from the input FASTA file, with the source of, either 
-#' NCBI (with default FASTA header) or GISAID (with default FASTA header).
+#' NCBI (with default FASTA header) or GISAID (with default FASTA header). The function will return a dataframe
+#' that has three columns consisting ID, collected region and collected date. Records that do not have region or date
+#' information will be excluded from the output dataframe.
 #'
 #' @param file_path path of fasta file
-#' @param source the source of fasta file, either "ncbi" or "GISAID"
+#' @param source the source of fasta file, either "NCBI" or "GISAID"
 #' @return  A dataframe that has three columns consisting ID, collected region and collected date
 #' @examples filepath <- system.file('extdata','GISAID_EpiCoV.faa', package = 'vDiveR')
 #' @examples meta_gisaid <- metadata_extraction(filepath, 'GISAID')
 #' @export
 metadata_extraction <- function(file_path, source){
-    if(source == 'ncbi'){
+    if (tolower(source) == 'ncbi') {
         meta <- extract_from_NCBI(file_path)
-    }
-    if(source == 'GISAID'){
+    } else if (tolower(source) == 'gisaid') {
         meta <- extract_from_GISAID(file_path)
+    } else {
+        stop("Invalid source specified. Please use 'NCBI' or 'GISAID'.")
     }
     return(meta)
 }
@@ -25,22 +28,20 @@ metadata_extraction <- function(file_path, source){
 #' @param file_path path of fasta file
 #' @importFrom stringr str_extract
 extract_from_GISAID <- function(file_path){
-    heads <- c()
-    lines <- readLines(file_path, warn=FALSE)
-    for(line in lines){
-        if(grepl('>', line)){
-            line <- substr(line, 2, nchar(line))
-            heads <- c(heads, line)
-        }
-    }
-    IDs <- c(); regions <- c(); dates <- c()
-    for(head in heads){
-        ID <- str_extract(head, "EPI[^|]*"); IDs <- c(IDs, ID)
-        region <- tryCatch(strsplit(head, '/', fixed=T)[[1]][2], error = function(e) "NA")
-        regions <- c(regions, region)
-        date <- str_extract(head, "[0-9]{4}-[0-9]{2}-[0-9]{2}"); dates <- c(dates, date)
-    }
-    return(data.frame('ID' = IDs, 'region' = regions, 'date' = dates))
+    lines <- readLines(file_path, warn = FALSE)
+    headers <- lines[grepl('^>', lines)]
+
+    #================= Extract metadata from headers =====================#
+    #e.g: >Spike|hCoV-19/Wuhan/WIV04/2019|2019-12-30|EPI_ISL_402124|
+    # id: EPI_ISL_402124
+    # region: Wuhan
+    # date: 2019-12-30
+
+    ids <- str_extract(headers, "EPI[^|]*")
+    regions <- sapply(strsplit(headers, "/"), function(x) if(length(x) > 1) x[2] else NA)
+    dates <- str_extract(headers, "\\d{4}-\\d{2}-\\d{2}")
+
+    return(data.frame(ID = ids, region = regions, date = dates, stringsAsFactors = FALSE))
 }
 
 #' Extract metadata via fasta file from ncbi
@@ -49,47 +50,109 @@ extract_from_GISAID <- function(file_path){
 #' @param file_path path of fasta file
 #' @importFrom rentrez entrez_search entrez_fetch
 extract_from_NCBI <- function(file_path){
-    IDs <- c()
-    lines <- readLines(file_path, warn=FALSE)
-    for(line in lines){
-        if(grepl('>', line)){
-            ID <- strsplit(line, ' ')[[1]][1]
-            ID <- substr(ID,2,nchar(ID))
-            IDs <- c(IDs, ID)
-        }
-    }
-    regions <- c(); dates <- c(); dropsample <- c(); keepsample <- c()
+    #================= Extract metadata from headers =====================#
+    #e.g: >QQL10871.1 Rh230 [Rhesus cytomegalovirus strain 68-1_FL]
+    DATE_FORMATS <- c('%Y-%m-%d', '%d-%B-%Y', '%m/%d/%Y', '%B %d, %Y', '%d-%m-%Y')
+
+    lines <- readLines(file_path, warn=FALSE) # read lines from fasta file into vector
+    # extract id from header
+    headers <- lines[grepl('^>', lines)]
+    headers <- substring(headers, 2)
+    IDs <- sapply(strsplit(headers, ' '), function(x) x[1])
+
+
+    regions <- character()
+    dates <- character()
+    drop_sample_not_found <- character()
+    drop_sample_no_date_or_region <- character()
+    drop_sample_date_format_issue <- character()
+    keep_sample <- character()
+
+    # regions <- c(); dates <- c(); drop_sample <- c(); keep_sample <- c()
     for(ID in IDs){
-        if(substr(ID,1,3) == 'pdb'){
-            dropsample <- c(dropsample, ID)
+        if(startsWith(ID, 'pdb')){
+            drop_sample_not_found <- c(drop_sample_not_found, ID)
             next
         }
-        keepsample <- c(keepsample, ID)
-        search_result <- entrez_search(db = "protein", term = ID, retmax = 1)
-        accession <- search_result$ids[[1]]
-        info <- entrez_fetch(db = "protein", id = accession, rettype = "gb", retmode = "text")
-        info <- tryCatch(strsplit(info, '\n')[[1]], error = function(e) "")
-        idx1 <- grep('/region=',  info); idx2 <- grep('/collection_date=',  info)
-        info1 <- info[idx1]; info2 <- info[idx2]
-        region <- tryCatch(strsplit(info1, '\\"')[[1]][2], error = function(e) "NA")
+
+        # keep_sample <- c(keep_sample, ID)
+        search_result <- rentrez::entrez_search(db = "protein", term = ID, retmax = 1)
+        if(search_result$count == 0){
+            drop_sample_not_found <- c(drop_sample_not_found, ID)
+            next
+        }
+
+        uid <- search_result$ids[1] #get unique identifier
+        # fetch genbank entry
+        genbank_entry <- rentrez::entrez_fetch(db = "protein", id = uid, rettype = "gb", retmode = "text")
+        entry_info <- tryCatch(strsplit(genbank_entry, '\n')[[1]], error = function(e) "")
+
+        # get region and date index
+        region_index <- grep('/country=',  entry_info)
+        date_index <- grep('/collection_date=',  entry_info)
+
+        if (length(region_index) == 0){region_index <- grep('/geo_loc_name=',  entry_info)}
+        if(length(region_index) == 0 | length(date_index) == 0){
+            drop_sample_no_date_or_region <- c(drop_sample_no_date_or_region, ID)
+            next
+        }
+
+        # extract region and date from the identified line
+        region_info <- entry_info[region_index]
+        date_info <- entry_info[date_index]
+
+        region <- tryCatch(strsplit(region_info, '\\"')[[1]][2], error = function(e) "NA")
         if(grepl(':', region)){region <- strsplit(region,':')[[1]][1]}
-        date <- tryCatch(strsplit(info2, '\\"')[[1]][2], error = function(e) "NA")
-        regions <- c(regions, region); dates <- c(dates, date)
+        date <- tryCatch(strsplit(date_info, '\\"')[[1]][2], error = function(e) "NA")
+
+        regions <- c(regions, region)
+        dates <- c(dates, date)
+        keep_sample <- c(keep_sample, ID)
     }
-    tmp <- data.frame('ID' = keepsample, 'region' = regions, 'date' = dates)
-
-    for(i in 1:nrow(tmp)){
-        if(!is.na(as.Date(tmp$date[i],format='%Y-%m-%d'))){
-            tt <- 1
-        } else if(!is.na(as.Date(tmp$date[i],format='%d-%B-%Y'))){
-            tmp$date[i] <- as.character(as.Date(tmp$date[i],format="%d-%B-%Y"))
-        } else {
-            dropsample <- c(dropsample, tmp$ID[i])
+
+    if (length(regions) == 0 | length(dates) == 0) {
+        stop("No valid records found in the input file.")
+    }  
+
+    if (length(drop_sample_not_found) > 0) {
+        drop_sample_not_found <- unique(drop_sample_not_found)
+        warning_info <- paste(c('\nExcluded records:\n',drop_sample_not_found, '\nID not found in the database.'), collapse = " ")
+        warning(warning_info)
+    }
+    if (length(drop_sample_no_date_or_region) > 0) {
+        drop_sample_no_date_or_region <- unique(drop_sample_no_date_or_region)
+        warning_info <- paste(c('\nExcluded records:\n',drop_sample_no_date_or_region, '\nDate and/or region information is not available.'), collapse = " ")
+        warning(warning_info)
+    }    
+
+    result_df <- data.frame('ID' = keep_sample, 'region' = regions, 'date' = dates, stringsAsFactors = FALSE)
+    result_df$date <- sapply(result_df$date, function(d) { # format and validate dates
+        final_date <- NA
+
+        for(fmt in DATE_FORMATS) { # attempt to parse the date with each format
+            parsed_date <- as.Date(d, format = fmt)
+            if (!is.na(parsed_date)) { # if successful, convert to the standard format '%Y-%m-%d'
+                final_date <- as.character(parsed_date)
+                break
+            }
         }
+
+        if (is.na(final_date)) { # if no format matched, add to drop_sample_date_format_issue
+            drop_sample_date_format_issue <- c(drop_sample_date_format_issue, result_df$ID[result_df$date == d])
+        }
+
+        return(final_date)
+    })
+
+    if (length(drop_sample_date_format_issue) > 0) {
+        drop_sample_date_format_issue <- unique(drop_sample_date_format_issue)
+        warning_info <- paste(c('\nExcluded records:\n',drop_sample_date_format_issue, '\nDate in the entry is not in these formats: .', DATE_FORMATS, '\n.'), collapse = " ")
+        warning(warning_info)
     }
-    warninfo <- paste(c('\nExcluded records:\n',dropsample, '\nregion/date (%Y-%m-%d OR %d-%B-%Y format) are not provided.'), collapse = " ")
-    warning(warninfo)
-    tmp <- tmp[! tmp$ID %in% dropsample, ]
-    return(tmp)
+
+    all_drops <- c(drop_sample_not_found, drop_sample_no_date_or_region, drop_sample_date_format_issue)
+    result_df <- result_df[! result_df$ID %in% all_drops, ] # filter the result_df to exclude IDs in all_drops
+
+    return(result_df)
 }
 
diff --git a/R/plot_worldmap.R b/R/plot_worldmap.R
@@ -13,7 +13,7 @@
 #' @examples geographical_plot <- plot_worldmap(metadata)$plot
 #' @examples geographical_df <- plot_worldmap(metadata)$df
 #' @importFrom ggplot2 geom_polygon scale_fill_gradient map_data
-#' @importFrom dplyr left_join %>% groupby ungroup slice 
+#' @importFrom dplyr left_join %>% group_by ungroup slice 
 #' @importFrom stringr str_to_title
 #' @export
 plot_worldmap <- function(meta, base_size=8){

diff --git a/man/metadata_extraction.Rd b/man/metadata_extraction.Rd