Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
egillax committed Jan 17, 2025
1 parent 31a7db0 commit 272205c
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 23 deletions.
64 changes: 42 additions & 22 deletions R/FeatureEngineering.R
Original file line number Diff line number Diff line change
Expand Up @@ -562,46 +562,56 @@ createNormalizer <- function(type = "minmax") {
minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized = FALSE) {
start <- Sys.time()
if (!normalized) {
outData <- list(
labels = trainData$labels,
folds = trainData$folds,
covariateData = Andromeda::copyAndromeda(trainData$covariateData)
)
ParallelLogger::logInfo("Starting min-max normalization of continuous features")
# fit the normalization
# find continuous features from trainData$covariateData$analysisRef
continousFeatures <- trainData$covariateData$analysisRef %>%
continousFeatures <- outData$covariateData$analysisRef %>%
dplyr::filter(.data$isBinary == "N") %>%
dplyr::select("analysisId") %>%
dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>%
dplyr::inner_join(outData$covariateData$covariateRef, by = "analysisId") %>%
dplyr::pull(.data$covariateId)

# get max of each feature
trainData$covariateData$minMaxs <- trainData$covariateData$covariates %>%
outData$covariateData$minMaxs <- outData$covariateData$covariates %>%
dplyr::filter(.data$covariateId %in% continousFeatures) %>%
dplyr::group_by(.data$covariateId) %>%
dplyr::summarise(
max = max(.data$covariateValue, na.rm = TRUE),
min = min(.data$covariateValue, na.rm = TRUE)
) %>%
dplyr::collect()
on.exit(trainData$covariateData$minMaxs <- NULL, add = TRUE)
on.exit(outData$covariateData$minMaxs <- NULL, add = TRUE)

# save the normalization
attr(featureEngineeringSettings, "minMaxs") <-
trainData$covariateData$minMaxs %>% dplyr::collect()
outData$covariateData$minMaxs %>% dplyr::collect()

# apply the normalization to trainData
trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
dplyr::left_join(trainData$covariateData$minMaxs, by = "covariateId") %>%
outData$covariateData$covariates <- outData$covariateData$covariates %>%
dplyr::left_join(outData$covariateData$minMaxs, by = "covariateId") %>%
# use ifelse to only normalize if min and max are not NA as is the case
# for continuous features, else return original value
dplyr::mutate(covariateValue = ifelse(!is.na(min) & !is.na(max),
(.data$covariateValue - min) / (max - min),
.data$covariateValue
)) %>%
dplyr::select(-c("max", "min"))
trainData$covariateData$minMaxs <- NULL
outData$covariateData$minMaxs <- NULL
normalized <- TRUE
} else {
ParallelLogger::logInfo("Applying min-max normalization of continuous features to test data")
outData <- list(
labels = trainData$labels,
folds = trainData$folds,
covariateData = Andromeda::copyAndromeda(trainData$covariateData)
)
# apply the normalization to test data by using saved normalization values
trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
outData$covariateData$covariates <- outData$covariateData$covariates %>%
dplyr::left_join(attr(featureEngineeringSettings, "minMaxs"),
by = "covariateId", copy = TRUE
) %>%
Expand All @@ -619,14 +629,14 @@ minMaxNormalize <- function(trainData, featureEngineeringSettings, normalized =
)
)

attr(trainData$covariateData, "metaData")$featureEngineering[["minMaxNormalize"]] <-
attr(outData$covariateData, "metaData")$featureEngineering[["minMaxNormalize"]] <-
featureEngineering
delta <- Sys.time() - start
ParallelLogger::logInfo(paste0(
"Finished min-max normalization of continuous features in ",
signif(delta, 3), " ", attr(delta, "units")
))
return(trainData)
return(outData)
}

#' A function that normalizes continuous features by the interquartile range and forces
Expand All @@ -645,19 +655,24 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized =
start <- Sys.time()
if (!normalized) {
ParallelLogger::logInfo("Starting robust normalization of continuous features")
outData <- list(
labels = trainData$labels,
folds = trainData$folds,
covariateData = Andromeda::copyAndromeda(trainData$covariateData)
)
# find continuous features from trainData$covariateData$analysisRef
continousFeatures <- trainData$covariateData$analysisRef %>%
continousFeatures <- outData$covariateData$analysisRef %>%
dplyr::filter(.data$isBinary == "N") %>%
dplyr::select("analysisId") %>%
dplyr::inner_join(trainData$covariateData$covariateRef, by = "analysisId") %>%
dplyr::inner_join(outData$covariateData$covariateRef, by = "analysisId") %>%
dplyr::pull(.data$covariateId)

# get (25, 75)% quantiles of each feature
# sqlite (used by Andromeda) doesn't have quantile function, so we need to load the extension
# to get upper_quartile and lower_quartile_functions
RSQLite::initExtension(trainData$covariateData, "math")
RSQLite::initExtension(outData$covariateData, "math")

trainData$covariateData$quantiles <- trainData$covariateData$covariates %>%
outData$covariateData$quantiles <- outData$covariateData$covariates %>%
dplyr::filter(.data$covariateId %in% continousFeatures) %>%
dplyr::group_by(.data$covariateId) %>%
dplyr::summarise(
Expand All @@ -668,15 +683,15 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized =
dplyr::mutate(iqr = .data$q75 - .data$q25) %>%
dplyr::select(-c("q75", "q25")) %>%
dplyr::collect()
on.exit(trainData$covariateData$quantiles <- NULL, add = TRUE)
on.exit(outData$covariateData$quantiles <- NULL, add = TRUE)

# save the normalization
attr(featureEngineeringSettings, "quantiles") <-
trainData$covariateData$quantiles %>% dplyr::collect()
outData$covariateData$quantiles %>% dplyr::collect()

# apply the normalization to trainData
trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
dplyr::left_join(trainData$covariateData$quantiles, by = "covariateId") %>%
outData$covariateData$covariates <- outData$covariateData$covariates %>%
dplyr::left_join(outData$covariateData$quantiles, by = "covariateId") %>%
# use ifelse to only normalize continuous features
dplyr::mutate(covariateValue = ifelse(!is.na(.data$iqr) & !is.na(.data$median),
(.data$covariateValue - .data$median) / .data$iqr,
Expand All @@ -692,8 +707,13 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized =
normalized <- TRUE
} else {
ParallelLogger::logInfo("Applying robust normalization of continuous features to test data")
outData <- list(
labels = trainData$labels,
folds = trainData$folds,
covariateData = Andromeda::copyAndromeda(trainData$covariateData)
)
# apply the normalization to test data by using saved normalization values
trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
outData$covariateData$covariates <- outData$covariateData$covariates %>%
dplyr::left_join(attr(featureEngineeringSettings, "quantiles"),
by = "covariateId", copy = TRUE
) %>%
Expand All @@ -715,14 +735,14 @@ robustNormalize <- function(trainData, featureEngineeringSettings, normalized =
)
)

attr(trainData$covariateData, "metaData")$featureEngineering[["robustNormalize"]] <-
attr(outData$covariateData, "metaData")$featureEngineering[["robustNormalize"]] <-
featureEngineering
delta <- Sys.time() - start
ParallelLogger::logInfo(paste0(
"Finished robust normalization in ",
signif(delta, 3), " ", attr(delta, "units")
))
return(trainData)
return(outData)
}

#' Create the settings for removing rare features
Expand Down
10 changes: 9 additions & 1 deletion tests/testthat/test-featureEngineering.R
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,10 @@ test_that("normalization works", {
testFeature <- testNormalizedData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
expect_true(all(testFeature >= 0) && all(testFeature <= 1))
trainMin <- min(normalizedData$covariateData$covariates %>% dplyr::filter(.data$covariateId == 12101) %>% dplyr::pull(.data$covariateValue))
trainMax <- max(normalizedData$covariateData$covariates %>% dplyr::filter(.data$covariateId == 12101) %>% dplyr::pull(.data$covariateValue))
testNormFeature <- (testFeature - trainMin) / (trainMax - trainMin)
expect_equal(testFeature, testNormFeature)

normalizer <- createNormalizer(type = "robust")
data <- addFeature(tinyTrainData, 12101, -10, 10)
Expand All @@ -377,8 +380,13 @@ test_that("normalization works", {
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
expect_true(all(feature >= -3) && all(feature <= 3))
trainFeature <- data$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
testFeature <- newTestData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == 12101) %>%
dplyr::pull(.data$covariateValue)
testNormFeature <- (testFeature - median(trainFeature)) / IQR(trainFeature)
testNormFeature <- testNormFeature / sqrt(1 + (testNormFeature / 2)^2)
expect_true(all(testFeature >= -3) && all(testFeature <= 3))
})

0 comments on commit 272205c

Please sign in to comment.