-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05-A-LD-train-prediction-sfv1.R
85 lines (75 loc) · 2.42 KB
/
05-A-LD-train-prediction-sfv1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#----------------------
# Author: Rintu Kutum
#----------------------
# NOTE(review): rm(list = ls()) wipes the caller's global workspace when this
# script is sourced; kept only to preserve the script's existing behaviour.
rm(list = ls())
#----------------------
# helpers (defined before first use so the script runs top to bottom)

# Columns kept from both the training and the prediction tables: the sample
# id, country, collection date and the 36 columns named "1".."36" (renamed to
# LD1..LD36 just before saving).
colNames <- c(
  'gisaid_epi_isl', 'country', 'Collection_Date',
  as.character(1:36)
)

# Print a tally of duplicated entries in `x` (as flagged by duplicated(), i.e.
# the first occurrence is not marked). When `return.value` is TRUE the logical
# mask is returned; otherwise the function returns NULL invisibly.
check.duplicate <- function(x, return.value = FALSE) {
  idx.dup <- duplicated(x)
  print(table(idx.dup))
  if (return.value) {
    return(idx.dup)
  }
}

# Reduce a single '-'-separated date-like value to its first two fields joined
# by '-' (e.g. "2021-03-15" -> "2021-03"). Operates on one value at a time.
getYM <- function(x) {
  paste(
    strsplit(as.character(x), split = '\\-')[[1]][1:2],
    collapse = '-')
}

# Apply getYM() element-wise and always return an unnamed character vector,
# including for zero-length input (sapply() would return list() there and
# would attach the input strings as names).
getYMs <- function(dates) {
  vapply(dates, getYM, character(1), USE.NAMES = FALSE)
}
#----------------------
# train embeddings
embedding.model01 <- readr::read_csv(
  'data/embeddings/word2vec-spike-all-countries-36-VS_updated.csv'
)
# The renames below are positional, so fail early with a clear message if the
# file does not have the expected number of columns.
stopifnot(ncol(embedding.model01) >= 40)
# Drop rows whose Collection_Date contains "2021-04". A logical mask is used
# instead of negative integer indices: x[-integer(0), ] selects zero rows, so
# the grep()-based form would discard everything when nothing matches.
is.rm <- grepl("2021-04", embedding.model01$Collection_Date)
colnames(embedding.model01)[c(2, 4)] <- c(
  "gisaid_epi_isl", "country")
embedding.model01 <- embedding.model01[!is.rm, ]
colnames(embedding.model01)[5:40] <- as.character(1:36)
# Keep the first row for every gisaid_epi_isl and only the shared columns.
idx.dup <- check.duplicate(
  embedding.model01$gisaid_epi_isl,
  return.value = TRUE)
sf_v1_embedd <- embedding.model01[!idx.dup, colNames]
sf_v1_embedd$ym <- getYMs(sf_v1_embedd$Collection_Date)
sf_v1_embedd$model <- 'sfv1:train'
#----------------------
# prediction embeddings
comb_embedd <- readr::read_csv(
  'data/embeddings/combined.csv'
)
comb_embedd_only <- comb_embedd[, colNames]
# Keep the first row for every gisaid_epi_isl, then derive the year-month
# label (row-wise, so computing it after the de-duplication gives the same
# values for the retained rows).
idx.dup <- check.duplicate(
  comb_embedd_only$gisaid_epi_isl,
  return.value = TRUE)
comb_embedd_only <- comb_embedd_only[!idx.dup, ]
comb_embedd_only$ym <- getYMs(comb_embedd_only$Collection_Date)
# Harmonise country spellings, then replace spaces with '-'.
comb_embedd_only$country[comb_embedd_only$country %in% c("United States","Usa")] <- "USA"
comb_embedd_only$country[comb_embedd_only$country == "belgium"] <- "Belgium"
comb_embedd_only$country[comb_embedd_only$country == "United Kingdom"] <- 'UK'
comb_embedd_only$country <- gsub(' ', '-', comb_embedd_only$country)
comb_embedd_only$model <- 'sfv1:prediction'
#----------------------
# train + prediction embeddings of strainflow v1
# Find ids present in both tables. The check runs on the concatenated id
# vectors (train first), which yields the same tally and mask as binding the
# full tables just to inspect one column.
all_ids <- c(sf_v1_embedd$gisaid_epi_isl, comb_embedd_only$gisaid_epi_isl)
idx.dup <- check.duplicate(all_ids, TRUE)
dup_gisaid <- unique(all_ids[idx.dup])
# remove the duplicates from prediction
idx.rm <- comb_embedd_only$gisaid_epi_isl %in% dup_gisaid
comb_embedd_only <- comb_embedd_only[!idx.rm, ]
strainflow_v1 <- rbind(
  sf_v1_embedd,
  comb_embedd_only
)
# Columns 4:39 are the "1".."36" columns selected via colNames (after the
# three id/metadata columns); rename them to LD1..LD36.
colnames(strainflow_v1)[4:39] <- paste0('LD', 1:36)
save(strainflow_v1, file = 'data/latent-space/strainflow_v1.RData')