-
Notifications
You must be signed in to change notification settings - Fork 23
/
reengineering_phenofile_neale_lab2.r
53 lines (41 loc) · 1.77 KB
/
reengineering_phenofile_neale_lab2.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
library("data.table")
# Neale Lab application.
bd2 <- fread("ukb11214.csv", header=TRUE, sep=',', na.strings=c("NA", ""))
bd2 <- as.data.frame(bd2)
# First get rid of things that are > 4.
colnames(bd2) <- gsub("-", "\\.", colnames(bd2))
visit_number <- as.integer(unlist(lapply(strsplit(names(bd2), "\\."), "[", 2)))[-1]
fields <- unlist(lapply(strsplit(names(bd2), "\\."), "[", 1))[-1]
to_include <- which(visit_number <= 4)
# Get rid of the fields that have up to 31 visits - small collection of Cancer variables,
# need to look at those separately.
# Cancer variables
cancer_fields <- unique(fields[which(visit_number > 4)])
to_include <- setdiff(to_include, which(fields %in% cancer_fields))
bd2 <- bd2[,c(1,to_include+1)]
# Now, want to go through these column names and determine, if there's a 0, include it, if not, look to 1 etc.
fields <- unlist(lapply(strsplit(names(bd2), "\\."), "[", 1))[-1]
visit_number <- as.integer(unlist(lapply(strsplit(names(bd2), "\\."), "[", 2))[-1])
unique_fields <- unique(fields)
fields_to_use <- c()
for (i in unique_fields) {
matches <- which(fields == i)
match_min <- which(visit_number[matches] == min(visit_number[matches]))
match <- matches[match_min]
fields_to_use <- c(fields_to_use, match)
}
bd2 <- bd2[,c(1,fields_to_use+1)]
c1 <- gsub("^", "x", colnames(bd2))
c2 <- gsub("\\.", "_", c1)
c3 <- c("userId", c2[2:length(c2)])
colnames(bd2) <- c3
# Hack to ensure that the age and sex are in as columns.
if (any(colnames(bd2) == "x21022_0_0") == FALSE) {
bd2 <- cbind(rep(1, nrow(bd2)), bd2)
colnames(bd2)[1] <- "x21022_0_0"
}
if (any(colnames(bd2) == "x31_0_0") == FALSE) {
bd2 <- cbind(rep(1, nrow(bd2)), bd2)
colnames(bd2)[1] <- "x31_0_0"
}
fwrite(bd2, file="neale_lab_parsed.tsv", row.names=FALSE, quote=FALSE, sep='\t')