-
Notifications
You must be signed in to change notification settings - Fork 10
/
fctr.R
132 lines (93 loc) · 2.74 KB
/
fctr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#' Factor tests
#'
#' @param data data frame to check factors in
#' @param .names names of variables in `data` to consider.
#'
#' @return check functions 'return' errors and the intent is
#' to return nothing if nothing is wrong,
#' so hopefully nothing is returned.
#'
#' @noRd
#'
fctr_check <- function(data, .names){
chrs <- c()
for( .name in .names ){
if(is.character(data[[.name]])){
chrs <- c(chrs, .name)
}
}
if(is_empty(chrs)) return(NULL)
chrs <- paste_collapse(chrs, last = ' and ')
stop(
"character variables in data should be converted to factors.\n",
"Here are the character variables I detected: ", chrs, call.= FALSE
)
}
#' Factor tests
#'
#' There is a good chance that someone using Surv(time, status) ~ .
#' will forget that inside of the '.' sits an ID variable.
#' Catching that and sending an informative error will likely
#' be appreciated.
#'
#' @param data data frame to check factors in
#' @param .names names of variables in `data` to consider.
#'
#' @return check functions 'return' errors and the intent is
#' to return nothing if nothing is wrong,
#' so hopefully nothing is returned.
#'
#' @noRd
#'
fctr_id_check <- function(data, .names){
for(.name in .names) {
if(is.factor(data[[.name]])){
if(length(levels(data[[.name]])) == nrow(data)){
stop("factor variable ", .name, " has as many levels as there ",
"are rows in the training data. Is ", .name, " an id variable?",
call. = FALSE)
}
}
}
}
#' Factor information
#'
#' @param data data frame to check factors in
#' @param .names names of variables in `data` to consider.
#' @param fctr_sep how to separate factor variable names from levels.
#'
#' @return a list describing factor variables in `data`.
#'
#' @noRd
#'
#' @example
#' fctr_info(pbc_orsf, .names = c('sex','stage'))
fctr_info <- function(data, .names, fctr_sep = '_'){
fctr_check(data, .names)
fctrs <- vector(mode = 'character')
ordrd <- vector(mode = 'logical')
for( .name in .names ){
if(is.factor(data[[.name]])){
fctrs <- c(fctrs, .name)
ordrd <- c(ordrd, is.ordered(data[[.name]]))
}
}
fctr_info <- vector(mode = 'list', length = 4L)
names(fctr_info) <- c('cols', 'lvls', 'keys', 'ordr')
# dont waste time if there aren't any factors
if(is_empty(fctrs)) return(fctr_info)
fctr_info$cols <- fctrs
fctr_info$ordr <- ordrd
fctr_info$lvls <- vector(mode = 'list', length = length(fctrs))
fctr_info$keys <- vector(mode = 'list', length = length(fctrs))
names(fctr_info$lvls) <- fctrs
names(fctr_info$keys) <- fctrs
for( i in seq_along(fctrs) ){
fctr <- fctrs[i]
lvls <- levels(data[[fctr]])
fctr_info$lvls[[fctr]] <- lvls
if(!fctr_info$ordr[i])
fctr_info$keys[[fctr]] <- paste(fctr, lvls, sep = fctr_sep)
}
fctr_info
}