diff --git a/NAMESPACE b/NAMESPACE index c23d30b02..65e133512 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(addMultipleDiagnosePlpToDatabase) export(addMultipleRunPlpToDatabase) export(addRunPlpToDatabase) export(averagePrecision) +export(borutaSettings) export(brierScore) export(calibrationLine) export(computeAuc) @@ -34,7 +35,6 @@ export(createSampleSettings) export(createStudyPopulation) export(createStudyPopulationSettings) export(createTempModelLoc) -export(createUnivariateFeatureSelection) export(createValidationSettings) export(diagnoseMultiplePlp) export(diagnosePlp) @@ -105,13 +105,13 @@ export(setPythonEnvironment) export(setRandomForest) export(setRidgeRegression) export(setSVM) -export(setStepwiseSelection) -export(setUnivariateSelection) export(simulatePlpData) export(sklearnFromJson) export(sklearnToJson) export(splitData) +export(stepwiseSettings) export(toSparseM) +export(univariateSettings) export(validateMultiplePlp) export(viewDatabaseResultPlp) export(viewMultiplePlp) diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index d9b3ee0ec..0a32ba849 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -39,20 +39,20 @@ fitCyclopsModel <- function( time = .data$survivalTime ) - covariates <- filterCovariateIds(param, trainData$covariateData) + trainData$covariateData$covariates <- filterCovariateIds(param, trainData$covariateData) if (!is.null(param$priorCoefs)) { sourceCoefs <- param$priorCoefs %>% dplyr::filter(abs(.data$betas)>0 & .data$covariateIds != "(Intercept)") - newCovariates <- covariates %>% + newCovariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$covariateId %in% !!sourceCoefs$covariateIds) %>% dplyr::mutate(newCovariateId = .data$covariateId*-1) %>% dplyr::select(-"covariateId") %>% dplyr::rename(covariateId = .data$newCovariateId) %>% dplyr::collect() - Andromeda::appendToTable(covariates, newCovariates) + Andromeda::appendToTable(trainData$covariateData$covariates, newCovariates) } @@ -60,7 +60,7 @@ fitCyclopsModel <- function( cyclopsData <- Cyclops::convertToCyclopsData( outcomes = trainData$covariateData$labels, - covariates = covariates, + covariates = trainData$covariateData$covariates, addIntercept = settings$addIntercept, modelType = modelTypeToCyclopsModelType(settings$modelType), checkRowIds = FALSE, @@ -97,7 +97,8 @@ fitCyclopsModel <- function( ParallelLogger::logInfo("Determining initialRidgeVariance") Cyclops::fitCyclopsModel(cyclopsData, prior = normalPrior, - control = normalControl)}, + control = normalControl, + warnings = FALSE)}, finally = ParallelLogger::logInfo("Done.")) param$priorParams$initialRidgeVariance <- fit$variance } @@ -132,9 +133,17 @@ fitCyclopsModel <- function( )}, finally = ParallelLogger::logInfo('Done.') ) + } else if (settings$manualCV) { + result <- doCyclopsCVPenalty(data = trainData, + prior = prior, + fixedCoefficients = fixedCoefficients, + startingCoefficients = startingCoefficients, + nTries = settings$nTries) + fit <- result$modelFit + hyperParamSearch <- result$hyperParamSearch } else{ fit <- tryCatch({ - ParallelLogger::logInfo('Running Cyclops with fixed varience') + ParallelLogger::logInfo('Running Cyclops with fixed variance') Cyclops::fitCyclopsModel(cyclopsData, prior = prior)}, finally = ParallelLogger::logInfo('Done.')) } @@ -173,7 +182,6 @@ fitCyclopsModel <- function( prediction$evaluationType <- 'Train' # get cv AUC if exists - cvPerFold <- data.frame() if(!is.null(modelTrained$cv)){ cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x){x$predCV})) 
cvPrediction$evaluationType <- 'CV' @@ -199,6 +207,7 @@ fitCyclopsModel <- function( # remove the cv from the model: modelTrained$cv <- NULL + hyperParamSearch <- cvPerFold } result <- list( @@ -237,7 +246,7 @@ fitCyclopsModel <- function( variance = modelTrained$priorVariance, log_likelihood = modelTrained$log_likelihood ), - hyperParamSearch = cvPerFold + hyperParamSearch = hyperParamSearch ), covariateImportance = variableImportance @@ -542,4 +551,137 @@ reparamTransferCoefs <- function(inCoefs) { coefs <- data.frame(betas = coefs, covariateIds = rownames(coefs), row.names = NULL) return(coefs) +} + +#' do simple CV to determine best penalty manually +#' +#' @details +#' Will try a sequence of penalties from `BIC` down to `penaltyRatio` * `BIC`. How many penalties +#' to try is determined by `nTries.` Will use cross-validation to determine optimal penalty +#' based on `AUC`. +#' +#' @param data The training data +#' @param prior Cyclops prior to use +#' @param fixedCoefficients What coefficients (if any) should be fixed +#' @param startingCoefficients What coefficients (if any) should have some starting value +#' @param penaltyRatio This controls the lowest penalty to try as a ratio of `BIC` penalty. +#' Trying very low penalties will increase computation times a lot. +#' @param nTries How many penalties to try. Default: 10 +doCyclopsCVPenalty <- function(data, + prior, + fixedCoefficients, + startingCoefficients, + penaltyRatio = 0.1, + nTries = 10) { + + # first penalty to try is the strict BIC + startingPenalty <- log(nrow(data$labels)) / 2 + + penalties <- seq(from = startingPenalty, + to = penaltyRatio * startingPenalty, + length.out = nTries) + + nFolds <- length(unique(data$folds$index)) + hyperParamSearch <- data.frame(matrix(nrow=length(penalties), ncol=nFolds+2)) + colnames(hyperParamSearch) <- c("Penalty", "avg_CV", unlist(lapply(seq_len(nFolds), + function(x) paste0("Fold_", x)))) + + ParallelLogger::logInfo("Performing hyperparameter tuning to determine best penalty") + + start <- Sys.time() + + data$covariateData$folds <- data$folds # folds needs to be Andromeda to for joins to work + on.exit(data$covariateData$folds <- NULL) + + data$covariateData$covariates <- data$covariateData$covariates %>% + dplyr::inner_join(data$covariateData$folds, by='rowId') %>% dplyr::collect() + + data$covariateData$labels <- data$covariateData$labels %>% + dplyr::inner_join(data$covariateData$folds, by='rowId') %>% dplyr::collect() + + for (i in seq_along(penalties)) { + hyperParamSearch[i, "Penalty"] <- penalties[i] + prior$penalty <- penalties[i] + + itStart <- Sys.time() + for (fold in seq_len(nFolds)) { + + trainData <- data$covariateData$covariates %>% + dplyr::filter(.data$index!=fold) %>% + dplyr::select(c("rowId", "covariateId", "covariateValue")) %>% + dplyr::collect() + + trainOutcomes <- data$covariateData$labels %>% + dplyr::filter(.data$index!=fold) %>% + dplyr::select(-.data$index) %>% + dplyr::collect() + + cyclopsData <- Cyclops::convertToCyclopsData(outcomes = trainOutcomes, + covariates = trainData, + modelType = 'lr', + addIntercept = TRUE, + checkRowIds = FALSE, + normalize = NULL, + quiet=TRUE) + + fit <- Cyclops::fitCyclopsModel( + cyclopsData = cyclopsData, + prior = prior, + fixedCoefficients = fixedCoefficients, + startingCoefficients = startingCoefficients, + warnings = FALSE) + + + # predict on held out fold + testData <- data$covariateData$covariates %>% + dplyr::filter(.data$index==fold) %>% + dplyr::select(c("rowId", "covariateId", "covariateValue")) %>% + 
dplyr::mutate(covariateId=as.numeric(.data$covariateId)) %>% + dplyr::collect() + + testOutcomes <- data$covariateData$labels %>% + dplyr::filter(.data$index==fold) %>% + dplyr::select(-.data$index) %>% + dplyr::collect() + + preds <- stats::predict(fit, newCovariates = testData, + newOutcomes = testOutcomes) + + # calculate performance + prediction <- data.frame(outcomeCount=testOutcomes$outcomeCount, + value=preds) + attr(prediction, "metaData")$modelType <- "binary" + auc <- computeAuc(prediction) + hyperParamSearch[i, paste0("Fold_", fold)] <- auc + } + hyperParamSearch[i, "avg_CV"] <- mean(as.numeric(hyperParamSearch[i, seq(3,3 + nFolds - 1)])) + itDelta <- Sys.time() - itStart + ParallelLogger::logInfo(paste0('Hyperparameter iteration no: ', i, ' Penalty: ', signif(prior$penalty, 3), + ' AUC: ', signif(hyperParamSearch[i, "avg_CV"], 3), + ' Iteration Time: ', signif(itDelta, 3), " ", attr(itDelta, "units"))) + } + + bestIndex <- which.max(hyperParamSearch$avg_CV) + bestPenalty <- hyperParamSearch[bestIndex, "Penalty"] + delta <- Sys.time() - start + ParallelLogger::logInfo(paste0("HyperParameter tuning took ", signif(delta, 3), " ", attr(delta, "units"))) + ParallelLogger::logInfo(paste0("Best penalty value is: ", signif(bestPenalty, 4), + " With performance of: ", signif(hyperParamSearch[bestIndex, "avg_CV"]))) + # refit at best value + + prior$penalty <- bestPenalty + cyclopsData <- Cyclops::convertToCyclopsData(data$covariateData$labels, + data$covariateData$covariates, + checkRowIds = FALSE, + quiet=TRUE) + ParallelLogger::logInfo("Refitting model with best penalty on training set") + fit <- Cyclops::fitCyclopsModel( + cyclopsData = cyclopsData, + prior = prior, + fixedCoefficients = fixedCoefficients, + startingCoefficients = startingCoefficients + ) + + results <- list(modelFit = fit, + hyperParamSearch = hyperParamSearch) } \ No newline at end of file diff --git a/R/CyclopsSettings.R b/R/CyclopsSettings.R index 656fb0706..6a7f62dd0 100644 --- a/R/CyclopsSettings.R +++ b/R/CyclopsSettings.R @@ -169,16 +169,18 @@ setCoxModel <- function( #' #' @param K The maximum number of non-zero predictors #' @param penalty Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value +#' If set to `auto` it will do CV to determine best penalty #' @param seed An option to add a seed when training the model #' @param exclude A vector of numbers or covariateId names to exclude from prior #' @param forceIntercept Logical: Force intercept coefficient into regularization #' @param fitBestSubset Logical: Fit final subset with no regularization -#' @param initialRidgeVariance integer or character vector. If set to auto will fit Ridge regression using -#' cross validation to determine best initialRidgeVariance value. +#' @param initialRidgeVariance integer or character vector. If set to `auto` will fit Ridge regression using +#' cross validation to determine best `initialRidgeVariance` value. 
#' @param tolerance numeric #' @param maxIterations integer #' @param threshold numeric #' @param delta numeric +#' @param nTries If `penalty` is `auto`, how many penalties to include in grid search #' #' @examples #' model.lr <- setLassoLogisticRegression() @@ -194,21 +196,23 @@ setIterativeHardThresholding<- function( tolerance = 1e-08, maxIterations = 10000, threshold = 1e-06, - delta = 0 + delta = 0, + nTries = 10 ){ ensure_installed("IterativeHardThresholding") if(K<1) stop('Invalid maximum number of predictors') - if(!(penalty %in% c("aic", "bic") || is.numeric(penalty))) - stop('Penalty must be "aic", "bic" or numeric') + if(!(penalty %in% c("aic", "bic", "auto") || is.numeric(penalty))) + stop('Penalty must be "aic", "bic", "auto" or numeric') if(!is.logical(forceIntercept)) stop("forceIntercept must be of type: logical") if(!is.logical(fitBestSubset)) stop("fitBestSubset must be of type: logical") if(!inherits(x = seed, what = c('numeric','NULL','integer'))) stop('Invalid seed') + # set seed @@ -242,6 +246,12 @@ setIterativeHardThresholding<- function( name = "Iterative Hard Thresholding" ) + if (penalty == "auto") { + attr(param, 'settings')$manualCV <- TRUE + attr(param, 'settings')$useControl <- FALSE + attr(param, 'settings')$nTries <- nTries + } + attr(param, 'modelType') <- 'binary' attr(param, 'saveType') <- 'RtoJson' diff --git a/R/FeatureEngineering.R b/R/FeatureEngineering.R index e3d0eee44..9d6a6dd81 100644 --- a/R/FeatureEngineering.R +++ b/R/FeatureEngineering.R @@ -10,7 +10,6 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
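Editorial aside (not part of the diff): the hunks above add a `penalty = "auto"` option to setIterativeHardThresholding() and route it to the new doCyclopsCVPenalty() helper, which grid-searches penalties from the BIC value down to penaltyRatio * BIC using cross-validated AUC. A minimal usage sketch follows; everything except the function and argument names introduced in the diff (the data size, K value, etc.) is a hypothetical placeholder.

# Hedged sketch: requesting the manual CV penalty search added in this PR
modelSettings <- setIterativeHardThresholding(
  K = 25,            # maximum number of non-zero predictors
  penalty = "auto",  # sets the manualCV attribute so fitCyclopsModel() calls doCyclopsCVPenalty()
  nTries = 10        # number of penalties in the grid search
)

# Candidate grid mirrored from doCyclopsCVPenalty(): start at the BIC penalty
# log(n) / 2 and step down to penaltyRatio * BIC (default ratio 0.1) in nTries steps.
n <- 10000                                 # hypothetical number of training rows
startingPenalty <- log(n) / 2
penalties <- seq(from = startingPenalty,
                 to = 0.1 * startingPenalty,
                 length.out = 10)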
@@ -42,41 +41,13 @@ createFeatureEngineeringSettings <- function(type = 'none'){ } -#' Create the settings for defining any feature selection that will be done -#' -#' @details -#' Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings -#' -#' @param k This function returns the K features most associated (univariately) to the outcome -#' -#' @return -#' An object of class \code{featureEngineeringSettings} -#' @export -createUnivariateFeatureSelection <- function(k = 100){ - - if (inherits(k, 'numeric')) { - k <- as.integer(k) - } - - checkIsClass(k, 'integer') - checkHigherEqual(k, 0) - - featureEngineeringSettings <- list(k = k) - - attr(featureEngineeringSettings, "fun") <- "univariateFeatureSelection" - class(featureEngineeringSettings) <- "featureEngineeringSettings" - - return(featureEngineeringSettings) - -} - #' Create the settings for random foreat based feature selection #' #' @details #' Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings #' -#' @param ntrees number of tree in forest -#' @param maxDepth MAx depth of each tree +#' @param ntrees Number of tree in forest +#' @param maxDepth Max depth of each tree #' #' @return #' An object of class \code{featureEngineeringSettings} @@ -99,72 +70,11 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){ return(featureEngineeringSettings) } -univariateFeatureSelection <- function( - trainData, - featureEngineeringSettings, - covariateIdsInclude = NULL){ - - if(is.null(covariateIdsInclude)){ - #convert data into matrix: - mappedData <- toSparseM(trainData, trainData$labels) - - matrixData <- mappedData$dataMatrix - labels <- mappedData$labels - covariateMap <- mappedData$covariateMap - - X <- reticulate::r_to_py(matrixData) - y <- reticulate::r_to_py(labels[,'outcomeCount']) - - np <- reticulate::import('numpy') - os <- reticulate::import('os') - sys <- reticulate::import('sys') - math <- reticulate::import('math') - scipy <- reticulate::import('scipy') - - sklearn <- reticulate::import('sklearn') - - SelectKBest <- sklearn$feature_selection$SelectKBest - chi2 <- sklearn$feature_selection$chi2 - - kbest <- SelectKBest(chi2, k = featureEngineeringSettings$k)$fit(X, y$outcomeCount) - kbest$scores_ <- np$nan_to_num(kbest$scores_) - - # taken from sklearn code, matches the application during transform call - k <- featureEngineeringSettings$k - mask <- np$zeros(length(kbest$scores_), dtype='bool') - mask[np$argsort(kbest$scores_, kind="mergesort")+1][(length(kbest$scores_)-k+1):length(kbest$scores_)] <- TRUE - - covariateIdsInclude <- covariateMap[mask,]$covariateId - } - - trainData$covariateData$covariates <- trainData$covariateData$covariates %>% - dplyr::filter(.data$covariateId %in% covariateIdsInclude) - - trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% - dplyr::filter(.data$covariateId %in% covariateIdsInclude) - - featureEngineering <- list( - funct = 'univariateFeatureSelection', - settings = list( - featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = covariateIdsInclude - ) - ) - - attr(trainData, 'metaData')$featureEngineering = listAppend( - attr(trainData, 'metaData')$featureEngineering, - featureEngineering - ) - - return(trainData) - -} - randomForestFeatureSelection <- function( trainData, featureEngineeringSettings, - covariateIdsInclude = NULL + covariateIdsInclude = NULL # 
equivalent to covariateIdsSelected in VariableSelection.R ){ if(is.null(covariateIdsInclude)){ @@ -231,7 +141,7 @@ randomForestFeatureSelection <- function( } - +# Help function to apply any feature engineering method featureEngineer <- function(data, featureEngineeringSettings){ ParallelLogger::logInfo('Starting Feature Engineering') diff --git a/R/NJMIM.R b/R/NJMIM.R deleted file mode 100644 index d55a6538b..000000000 --- a/R/NJMIM.R +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -#' Create settings using normalized joint mutual information maximization (NJMIM) for feature selection -#' @param k number of features to select -#' @export -njmimSettings <- function(k=20) { - - ensure_installed('praznik') - - checkIsClass(k, c('integer', 'numeric')) - checkHigherEqual(k, 0) - - featureEngineeringSettings <- list(k = k) - - attr(featureEngineeringSettings, "fun") <- "njmimFeatureSelection" - class(featureEngineeringSettings) <- "featureEngineeringSettings" - - return(featureEngineeringSettings) -} - -njmimFeatureSelection <- function(trainData, - featureEngineeringSettings, - covariateIdsInclude = NULL) { - if (is.null(covariateIdsInclude)) { - sparseData <- toSparseM(trainData, trainData$labels) - denseMatrix <- as.matrix(sparseData$dataMatrix) - dataFrame <- as.data.frame(denseMatrix) - y <- sparseData$labels$outcomeCount - - selection <- praznik::NJMIM(X = dataFrame, - Y = y, - k = featureEngineeringSettings$k) - - covariateIdsInclude <- sparseData$covariateMap %>% - dplyr::filter(columnId %in% selection$selection) %>% - dplyr::pull(covariateId) - } - - - trainData$covariateData$covariates <- trainData$covariateData$covariates %>% - dplyr::filter(covariateId %in% covariateIdsInclude) - trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% - dplyr::filter(covariateId %in% covariateIdsInclude) - - featureEngineering <- list( - funct = 'njmimFeatureSelection', - settings = list( - featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = covariateIdsInclude - ) - ) - - attr(trainData, 'metaData')$featureEngineering = listAppend( - attr(trainData, 'metaData')$featureEngineering, - featureEngineering - ) - - return(trainData) -} \ No newline at end of file diff --git a/R/RunPlp.R b/R/RunPlp.R index 2f9cd0199..a4b45f340 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -219,7 +219,7 @@ runPlp <- function( executeSettings = createDefaultExecuteSettings(), saveDirectory = getwd() ){ - + start <- Sys.time() # start log analysisPath <- file.path(saveDirectory, analysisId) logSettings$saveDirectory <- analysisPath @@ -513,7 +513,8 @@ runPlp <- function( tryCatch(savePlpResult(results, file.path(analysisPath,'plpResult')), finally= ParallelLogger::logTrace('Done.')) ParallelLogger::logInfo(paste0('plpResult saved to ..\\', analysisPath ,'\\plpResult')) - + delta <- Sys.time() - start + 
ParallelLogger::logInfo("RunPLP took ", signif(delta, 3), " ", attr(delta, "units")) return(results) } diff --git a/R/VariableSelection.R b/R/VariableSelection.R index 7ff6037c3..7530831a8 100644 --- a/R/VariableSelection.R +++ b/R/VariableSelection.R @@ -16,216 +16,287 @@ # See the License for the specific language governing permissions and # limitations under the License. -#' Selects features based on univariate statistics -#' -#' @param corMethod which type of correlation to use, `pearson`, `kendall` or `spearman`. default `pearson` -#' @param modelSettings settings of model to use in fit after selecting variables -#' @param nVariables amount of variables to select, default `50` -#' + +#' Create settings using normalized joint mutual information maximization (NJMIM) for feature selection +#' @param k number of variables to select, default `20` #' @export -setUnivariateSelection <- function(modelSettings = PatientLevelPrediction::setLassoLogisticRegression(), - corMethod = "pearson", - nVariables = 50) { # TODO: set dynamic number based on elbow +njmimSettings <- function(k = 20) { - checkIsClass(nVariables, c('numeric','integer')) - if (!corMethod %in% c("pearson", "kendall", "spearman")) { - stop("corMethod needs to be either 'pearson', 'kendall' or 'spearman'") + ensure_installed('praznik') + + checkIsClass(k, c('integer', 'numeric')) + checkHigherEqual(k, 0) + + featureEngineeringSettings <- list(k = k) + + attr(featureEngineeringSettings, "fun") <- "njmimFeatureSelection" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + + return(featureEngineeringSettings) +} + +njmimFeatureSelection <- function(trainData, + featureEngineeringSettings, + covariateIdsSelected = NULL) { + if (is.null(covariateIdsSelected)) { + sparseData <- toSparseM(trainData, trainData$labels) + denseMatrix <- as.matrix(sparseData$dataMatrix) + dataFrame <- as.data.frame(denseMatrix) + y <- sparseData$labels$outcomeCount + + if (ncol(dataFrame) > featureEngineeringSettings$k) { + selection <- praznik::NJMIM(X = dataFrame, + Y = y, + k = featureEngineeringSettings$k) + + covariateIdsSelected <- sparseData$covariateMap %>% + dplyr::filter(columnId %in% selection$selection) %>% + dplyr::pull(covariateId) + } else { + # return all covariates + covariateIdsSelected <- sparseData$covariateMap %>% + dplyr::pull(covariateId) + } + } - # TODO: add class checks input (modelSettings, corMethod)? - param <- list( - modelSettings = modelSettings, - param = list( - corMethod = corMethod, - nVariables = nVariables + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(covariateId %in% covariateIdsSelected) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(covariateId %in% covariateIdsSelected) + + featureEngineering <- list( + funct = 'njmimFeatureSelection', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsSelected = covariateIdsSelected ) ) - result <- list( - fitFunction = "fitUnivariateSelection", - param = param + attr(trainData, 'metaData')$featureEngineering = listAppend( + attr(trainData, 'metaData')$featureEngineering, + featureEngineering ) - class(result) <- "modelSettings" - return(result) + return(trainData) } -fitUnivariateSelection <- function( - trainData, - modelSettings, - search = 'none', - analysisId, - stop = F, - ...) 
{ +#' Create settings using univariate statistics for feature selection +#' @param corMethod which type of correlation to use, `pearson`, `kendall` or `spearman`. default `pearson` +#' @param k number of variables to select, default `20` +#' +#' @export +univariateSettings <- function(k = 20, # TODO: set dynamic number based on elbow + corMethod = "pearson"){ - # Settings variable selection - param <- modelSettings$param$param + if (inherits(k, 'numeric')) { + k <- as.integer(k) + } - covariates <- as.data.frame(trainData$covariateData$covariates) + checkIsClass(k, 'integer') + checkHigherEqual(k, 0) - if (length(unique(covariates$covariateId)) > param$nVariables) { - outcomes <- trainData$labels[, c("rowId", "outcomeCount")] - - # Select features based on univariate association with outcome - correlation <- sapply(unique(covariates$covariateId), function(covId) { - stats::cor(ifelse(outcomes$rowId %in% covariates$rowId[covariates$covariateId == covId], 1, 0), # TODO: can this be done smarter for sparse data - outcomes$outcomeCount, - method = param$corMethod) - }) - names(correlation) <- unique(covariates$covariateId) - - # Order from high to low absolute correlation - correlation <- correlation[order(abs(correlation), decreasing = TRUE)] - - # Select variables - selected <- names(correlation)[1:min(param$nVariables, length(correlation))] - - # Update covariate data - trainData$covariateData <- updateCovariateData(trainData$covariateData, covIds=selected, update="select") + if (!corMethod %in% c("pearson", "kendall", "spearman")) { + stop("corMethod needs to be either 'pearson', 'kendall' or 'spearman'") + } + + featureEngineeringSettings <- list(corMethod = corMethod, + k = k) + + attr(featureEngineeringSettings, "fun") <- "univariateFeatureSelection" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + + return(featureEngineeringSettings) +} + + +univariateFeatureSelection <- function(trainData, + featureEngineeringSettings, + covariateIdsSelected = NULL) { + if (is.null(covariateIdsSelected)) { + sparseData <- toSparseM(trainData, trainData$labels) + denseMatrix <- as.matrix(sparseData$dataMatrix) + dataFrame <- as.data.frame(denseMatrix) + y <- sparseData$labels$outcomeCount - ParallelLogger::logTrace('Finished variable selection.') - } else { - ParallelLogger::logTrace('No variable selection, number of covariates less than or equal to nVariables.') + if (ncol(dataFrame) > featureEngineeringSettings$k) { + # Select features based on univariate association with outcome + correlation <- sapply(1:ncol(dataFrame), function(col) { + stats::cor(dataFrame[,col], y, method = featureEngineeringSettings$corMethod) + }) + names(correlation) <- 1:ncol(dataFrame) + + # Order from high to low absolute correlation + correlation <- correlation[order(abs(correlation), decreasing = TRUE)] + + # Select variables + selected <- names(correlation)[1:min(featureEngineeringSettings$k, length(correlation))] + + covariateIdsSelected <- sparseData$covariateMap %>% + dplyr::filter(columnId %in% selected) %>% + dplyr::pull(covariateId) + + } else { + # return all covariates + covariateIdsSelected <- sparseData$covariateMap %>% + dplyr::pull(covariateId) + } } - # Fit final PLP model - fun <- eval(parse(text = modelSettings$param$modelSettings$fitFunction)) - args <- list( - trainData = trainData, - modelSettings = modelSettings$param$modelSettings, - search = search, - analysisId = analysisId + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + 
dplyr::filter(covariateId %in% covariateIdsSelected) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(covariateId %in% covariateIdsSelected) + + featureEngineering <- list( + funct = 'univariateFeatureSelection', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsSelected = covariateIdsSelected + ) + ) + + attr(trainData, 'metaData')$featureEngineering = listAppend( + attr(trainData, 'metaData')$featureEngineering, + featureEngineering ) - plpModel <- do.call(fun, args) - return(plpModel) + return(trainData) } -#' Selects features using stepwise selection -#' -#' @param modelSettings settings of model to use in fit after selecting variables + + +#' Create settings using stepwise selection for feature selection +#' @param k number of variables to select, default `20` #' @param selectMethod `backward` or `forward` selection -#' @param nInitialVariables # of variables to select initially -#' @param nVariables amount of variables to select, default `20` +#' @param kStart number of variables to select initially #' @param stepSize How many variables to add/remove in each step -#' +#' @param modelSettings settings of model to use in fit after selecting variables #' @export -setStepwiseSelection <- function(modelSettings, - selectMethod = "backward", - nInitialVariables = 50, # TODO: set dynamic number based on elbow - nVariables = 20, # TODO: set dynamic number based on elbow - stepSize = 1) { - - checkIsClass(nInitialVariables, c('numeric','integer')) - checkIsClass(nVariables, c('numeric','integer')) - checkIsClass(stepSize, c('numeric','integer')) +stepwiseSettings <- function(k = 20, # TODO: set dynamic number based on elbow + selectMethod = "backward", + kStart = 100, # TODO: set dynamic number based on elbow + stepSize = 1, + modelSettings = PatientLevelPrediction::setLassoLogisticRegression()) { + + checkIsClass(k, c('integer', 'numeric')) + checkHigherEqual(k, 0) + if (!selectMethod %in% c("forward", "backward")) { stop("selectMethod needs to be either 'forward' or 'backward") } - # TODO: add class checks input (modelSettings, selectMethod)? - - param <- list( - modelSettings = PatientLevelPrediction::setLassoLogisticRegression(), - param = list( - selectMethod = selectMethod, - nInitialVariables = nInitialVariables, # For classifiers with initial selection based on variable importance - nVariables = nVariables, - stepSize = stepSize # Number of variables removed at the same time - ) - ) - result <- list( - fitFunction = "fitStepwiseSelection", - param = param - ) - class(result) <- "modelSettings" + checkIsClass(kStart, c('integer', 'numeric')) + checkHigherEqual(kStart, 0) - return(result) + checkIsClass(stepSize, c('integer', 'numeric')) + checkHigherEqual(stepSize, 0) + + featureEngineeringSettings <- list(k = k, + selectMethod = selectMethod, + kStart = kStart, + stepSize = stepSize, + modelSettings = modelSettings) + + attr(featureEngineeringSettings, "fun") <- "stepwiseFeatureSelection" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + + return(featureEngineeringSettings) } -fitStepwiseSelection <- function( - trainData, - modelSettings, - search = 'none', - analysisId, - stop = F, - ...) 
{ - - # Initial fit PLP model - fun <- eval(parse(text = modelSettings$param$modelSettings$fitFunction)) - args <- list( - trainData = trainData, - modelSettings = modelSettings$param$modelSettings, - search = search, - analysisId = analysisId - ) - plpModel <- do.call(fun, args) - - if (modelSettings$param$param$selectMethod == "backward") { - # Initial selection - # Select non-zero coefficients for LASSO - covIds <- plpModel$model$coefficients$covariateIds[plpModel$model$coefficients$betas != 0] +stepwiseFeatureSelection <- function(trainData, + featureEngineeringSettings, + covariateIdsSelected = NULL) { + if (is.null(covariateIdsSelected)) { + search <- 999 # TODO: which number to use? + analysisId <- 999 # TODO: which number to use? - # TODO: make generic across algorithms based on var importance (e.g. randomForest) - # covIds <- TODO + # Initial fit PLP model + fun <- eval(parse(text = featureEngineeringSettings$modelSettings$fitFunction)) + args <- list( + trainData = trainData, + modelSettings = featureEngineeringSettings$modelSettings, + search = search, + analysisId = analysisId + ) + plpModel <- do.call(fun, args) - fullCovariates <- NULL - updateIteration <- "remove" + if (featureEngineeringSettings$selectMethod == "backward") { + # Initial selection + # Select non-zero coefficients for LASSO + covIds <- plpModel$model$coefficients$covariateIds[plpModel$model$coefficients$betas != 0] + + # TODO: make generic across algorithms based on var importance (e.g. randomForest) + # covIds <- TODO + + fullCovariates <- NULL + updateIteration <- "remove" + + } else if (featureEngineeringSettings$selectMethod == "forward") { + # Initial selection (empty) + covIds <- NULL + + fullCovariates <- as.data.frame(trainData$covariateData$covariates) + updateIteration <- "add" + + } else { + stop("Variable selection stopped: selectMethod not implemented.") + } - } else if (modelSettings$param$param$selectMethod == "forward") { - # Initial selection (empty) - covIds <- NULL + update <- "select" # Only first update of covariates + update_trainData <- trainData - fullCovariates <- as.data.frame(trainData$covariateData$covariates) - updateIteration <- "add" + while(!is.null(covIds) | update == "select") { # Stop when selected is NULL + # Update covariate data + update_trainData$covariateData <- updateCovariateData(update_trainData$covariateData, covIds=covIds, update=update, fullCovariates=fullCovariates) + + # Backward or forward select variables + update <- updateIteration + covIds <- selectVariables(featureEngineeringSettings, update_trainData, fullCovariates, search, analysisId) + } - } else { - stop("Variable selection stopped: selectMethod not implemented.") + covariates <- as.data.frame(update_trainData$covariateData$covariates) + covariateIdsSelected <- unique(covariates$covariateId) } - update <- "select" # Only first update of covariates - while(!is.null(covIds) | update == "select") { # Stop when selected is NULL - # Update covariate data - trainData$covariateData <- updateCovariateData(trainData$covariateData, covIds=covIds, update=update, fullCovariates=fullCovariates) - - # Backward or forward select variables - update <- updateIteration - covIds <- selectVariables(modelSettings, trainData, fullCovariates, search, analysisId) - } + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(covariateId %in% covariateIdsSelected) + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(covariateId %in% 
covariateIdsSelected) - ParallelLogger::logTrace('Finished variable selection.') + featureEngineering <- list( + funct = 'univariateFeatureSelection', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsSelected = covariateIdsSelected + ) + ) - # Fit final PLP model - fun <- eval(parse(text = modelSettings$param$modelSettings$fitFunction)) - args <- list( - trainData = trainData, - modelSettings = modelSettings$param$modelSettings, - search = search, - analysisId = analysisId + attr(trainData, 'metaData')$featureEngineering = listAppend( + attr(trainData, 'metaData')$featureEngineering, + featureEngineering ) - plpModel <- do.call(fun, args) - return(plpModel) + return(trainData) } -selectVariables <- function(modelSettings, trainData, fullCovariates, search, analysisId) { +# Help function stepwiseFeatureSelection +selectVariables <- function(featureEngineeringSettings, trainData, fullCovariates, search, analysisId) { # Settings variable selection - param <- modelSettings$param$param - covariates <- as.data.frame(trainData$covariateData$covariates) - if (param$selectMethod == "backward") { + if (featureEngineeringSettings$selectMethod == "backward") { update <- "remove" - start <- (length(unique(covariates$covariateId)) > param$nVariables) # Too many variables + start <- (length(unique(covariates$covariateId)) > featureEngineeringSettings$k) # Too many variables covariateList <- unique(covariates$covariateId) - } else if (param$selectMethod == "forward") { + } else if (featureEngineeringSettings$selectMethod == "forward") { update <- "add" - start <- (length(unique(covariates$covariateId)) < param$nVariables) # Not enough variables + start <- (length(unique(covariates$covariateId)) < featureEngineeringSettings$k) # Not enough variables covariateList <- unique(fullCovariates$covariateId)[!(unique(fullCovariates$covariateId) %in% unique(covariates$covariateId))] } - # TODO: make correction for param$stepSize to come to exactly the right number of variables + # TODO: make correction for featureEngineeringSettings$stepSize to come to exactly the right number of variables if (start) { performance <- sapply(covariateList, function(covId) { @@ -234,10 +305,10 @@ selectVariables <- function(modelSettings, trainData, fullCovariates, search, an tempData$covariateData <- updateCovariateData(tempData$covariateData, covIds=covId, update=update, fullCovariates=fullCovariates) # Re-fit PLP model - fun <- eval(parse(text = modelSettings$param$modelSettings$fitFunction)) + fun <- eval(parse(text = featureEngineeringSettings$modelSettings$fitFunction)) args <- list( trainData = tempData, - modelSettings = modelSettings$param$modelSettings, + modelSettings = featureEngineeringSettings$modelSettings, search = search, analysisId = analysisId ) @@ -251,19 +322,19 @@ selectVariables <- function(modelSettings, trainData, fullCovariates, search, an }) names(performance) <- covariateList - if (param$selectMethod == "backward") { + if (featureEngineeringSettings$selectMethod == "backward") { # Order from low to high performance (minimum negative log likelihood is high) performance <- performance[order(performance, decreasing = TRUE)] # Select variables to remove - covIds <- names(performance)[1:min(param$stepSize, length(performance))] + covIds <- names(performance)[1:min(featureEngineeringSettings$stepSize, length(performance))] - } else if (param$selectMethod == "forward") { + } else if (featureEngineeringSettings$selectMethod == "forward") { # Order from high to low 
performance (minimum negative log likelihood is high) performance <- performance[order(performance, decreasing = FALSE)] # Select variables to add - covIds <- names(performance)[1:min(param$stepSize, length(performance))] + covIds <- names(performance)[1:min(featureEngineeringSettings$stepSize, length(performance))] } return(covIds) @@ -272,6 +343,7 @@ selectVariables <- function(modelSettings, trainData, fullCovariates, search, an return(NULL) # Return NULL to initiate stop } +# Help function stepwiseFeatureSelection updateCovariateData <- function(covariateData, covIds, update="select", fullCovariates=NULL) { # FeatureExtraction -> excludedCovariateConceptIds: A list of concept IDs that should NOT be used to construct covariates. newCovariates <- as.data.frame(covariateData$covariates) # TODO: try without @@ -304,3 +376,142 @@ updateCovariateData <- function(covariateData, covIds, update="select", fullCova } +#' Create the settings for Boruta feature selection +#' +#' @details +#' From: https://doi.org/10.18637/jss.v036.i11 +#' +#' @param nJobs How many jobs to do in parallel +#' @param maxDepth Max depth of each tree in the RandomForest used +#' @param nTrees How many trees to use, default is `auto` +#' @param verbosity 0 for silent, 1 to display iteration number, 2 to display features selected as well +#' @param iterations How many iterations to run `Boruta` for. Default: 100 +#' @param randomState Either `NULL` or an integer. If integer it is the seed used by the random number generator +#' +#' @return +#' An object of class \code{featureEngineeringSettings} +#' @export +borutaSettings <- function(nJobs = -1L, + maxDepth = 5L, + nTrees = "auto", + verbosity = 2L, + iterations = 100L, + randomState = 42L +){ + # check python environment + tryCatch(reticulate::import('numpy'), error = function(e) stop("Numpy must be available in python environment")) + tryCatch(reticulate::import('boruta'), error= function(e) stop("Boruta must be installed in the python environment")) + tryCatch(reticulate::import('sklearn'), error= function(e) stop("sklearn must be installed in the python environment")) + + # check input variables + if (inherits(nJobs, "numeric")) { + nJobs <- as.integer(nJobs) + } + if (inherits(maxDepth, "numeric")) { + maxDepth <- as.integer(maxDepth) + } + if (inherits(nTrees, "numeric")) { + nTrees <- as.integer(nTrees) + } + if (inherits(verbosity, "numeric")) { + verbosity <- as.integer(verbosity) + } + if (inherits(iterations, "numeric")) { + iterations <- as.integer(iterations) + } + if (inherits(randomState, "numeric")) { + randomState <- as.integer(randomState) + } + + + checkIsClass(nTrees, c('integer', "character")) + checkIsClass(maxDepth, c('integer')) + checkIsClass(randomState, c("integer", "NULL")) + if (inherits(nTrees, c("integer"))) { + checkHigher(nTrees, 0) + } else { + if (nTrees != "auto") { + stop("nTrees should be either an integer or 'auto'") + } + } + checkHigher(maxDepth, 0) + checkHigher(iterations, 0) + + if (!verbosity %in% c(0L, 1L, 2L)) { + stop(paste0("verbosity must be one of 0, 1, 2. 
You supplied: ", verbosity)) + } + + featureEngineeringSettings <- list( + nTrees = nTrees, + maxDepth = maxDepth, + iterations = iterations, + verbosity = verbosity, + randomState = randomState, + nJobs = nJobs + ) + + attr(featureEngineeringSettings, "fun") <- "borutaFeatureSelection" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + + return(featureEngineeringSettings) +} + +borutaFeatureSelection <- function( + trainData, + featureEngineeringSettings, + covariateIdsSelected = NULL +){ + + if(is.null(covariateIdsSelected)){ + sparseData <- toSparseM(trainData) + dataMatrix <- sparseData$dataMatrix + covariateMap <- sparseData$covariateMap + + X <- reticulate::r_to_py(dataMatrix) + y <- reticulate::r_to_py(matrix(sparseData$labels$outcomeCount, ncol=1)) + + sklearn <- reticulate::import('sklearn') + BorutaPy <- reticulate::import('boruta')$BorutaPy + + rf = sklearn$ensemble$RandomForestClassifier( + max_depth = featureEngineeringSettings$maxDepth, + n_jobs = featureEngineeringSettings$nJobs, + ) + + featureSelector <- BorutaPy(rf, n_estimators=featureEngineeringSettings$nTrees, + verbose=featureEngineeringSettings$verbosity, + random_state=featureEngineeringSettings$randomState, + max_iter=featureEngineeringSettings$iterations) + + featureSelector$fit(X, y$squeeze()) + + includedFeatures <- featureSelector$support_ + + covariateIdsSelected <- covariateMap %>% + dplyr::filter(.data$columnId %in% which(includedFeatures)) %>% + dplyr::select("covariateId") %>% dplyr::arrange("covariateId") %>% + dplyr::pull() + } + + trainData$covariateData$covariates <- trainData$covariateData$covariates %>% + dplyr::filter(.data$covariateId %in% covariateIdsSelected) + + trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>% + dplyr::filter(.data$covariateId %in% covariateIdsSelected) + + featureEngineering <- list( + funct = 'borutaFeatureSelection', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + covariateIdsSelected = covariateIdsSelected + ) + ) + + attr(trainData, 'metaData')$featureEngineering = listAppend( + attr(trainData, 'metaData')$featureEngineering, + featureEngineering + ) + + return(trainData) + +} diff --git a/R/ViewShinyPlp.R b/R/ViewShinyPlp.R index f80c22f1e..745f0f57a 100644 --- a/R/ViewShinyPlp.R +++ b/R/ViewShinyPlp.R @@ -50,7 +50,7 @@ viewPlp <- function(runPlp, validatePlp = NULL, diagnosePlp = NULL) { runPlp = runPlp, externalValidatePlp = validatePlp, diagnosePlp = diagnosePlp - ) + ) connectionDetailSettings <- list( dbms = 'sqlite', @@ -69,7 +69,7 @@ viewPlp <- function(runPlp, validatePlp = NULL, diagnosePlp = NULL) { ) viewPlps(databaseSettings) - + } @@ -88,14 +88,14 @@ viewPlp <- function(runPlp, validatePlp = NULL, diagnosePlp = NULL) { #' #' @export viewDatabaseResultPlp <- function( - mySchema, - myServer, - myUser, - myPassword, - myDbms, - myPort = NULL, - myTableAppend - ){ + mySchema, + myServer, + myUser, + myPassword, + myDbms, + myPort = NULL, + myTableAppend +){ connectionDetailSettings <- list( dbms = myDbms, @@ -132,19 +132,49 @@ viewPlps <- function(databaseSettings){ connectionDetails <- do.call( DatabaseConnector::createConnectionDetails, databaseSettings$connectionDetailSettings - ) + ) connection <- ResultModelManager::ConnectionHandler$new(connectionDetails) databaseSettings$connectionDetailSettings <- NULL - # set database settings into system variables - Sys.setenv("resultDatabaseDetails_prediction" = as.character(ParallelLogger::convertSettingsToJson(databaseSettings))) - - 
config <- ParallelLogger::loadSettingsFromJson( - fileName = system.file( - 'shinyConfig.json', - package = "PatientLevelPrediction" + shinyAppVersion <- strsplit(x = as.character(utils::packageVersion('ShinyAppBuilder')), split = '\\.')[[1]] + + if((shinyAppVersion[1] <= 1 & shinyAppVersion[2] < 2)){ + # Old code to be backwards compatable + config <- ParallelLogger::loadSettingsFromJson( + fileName = system.file( + 'shinyConfig.json', + package = "PatientLevelPrediction" + ) + ) + # set database settings into system variables + Sys.setenv("resultDatabaseDetails_prediction" = as.character(ParallelLogger::convertSettingsToJson(databaseSettings))) + ShinyAppBuilder::viewShiny( + config = config, + connection = connection ) + } else{ + ohdsiModulesVersion <- strsplit(x = as.character(utils::packageVersion('OhdsiShinyModules')), split = '\\.')[[1]] + if(paste0(ohdsiModulesVersion[1], ".", ohdsiModulesVersion[2])>= 1.2){ + config <- ParallelLogger::loadSettingsFromJson( + fileName = system.file( + 'shinyConfigUpdate.json', + package = "PatientLevelPrediction" + ) ) + databaseSettings$plpTablePrefix = databaseSettings$tablePrefix + databaseSettings$cgTablePrefix = databaseSettings$tablePrefix + databaseSettings$databaseTable = 'database_meta_table' + databaseSettings$databaseTablePrefix = databaseSettings$tablePrefix + ShinyAppBuilder::viewShiny( + config = config, + connection = connection, + resultDatabaseSettings = databaseSettings + ) + } else{ + ParallelLogger::logWarn('Need to update package OhdsiShinyModules') + } + + } + - ShinyAppBuilder::viewShiny(config = config, connection = connection) } \ No newline at end of file diff --git a/inst/doc/AddingCustomFeatureEngineering.pdf b/inst/doc/AddingCustomFeatureEngineering.pdf deleted file mode 100644 index e4e8220ce..000000000 Binary files a/inst/doc/AddingCustomFeatureEngineering.pdf and /dev/null differ diff --git a/inst/doc/AddingCustomModels.pdf b/inst/doc/AddingCustomModels.pdf deleted file mode 100644 index f58d39dbb..000000000 Binary files a/inst/doc/AddingCustomModels.pdf and /dev/null differ diff --git a/inst/doc/AddingCustomSamples.pdf b/inst/doc/AddingCustomSamples.pdf deleted file mode 100644 index e85e0baaf..000000000 Binary files a/inst/doc/AddingCustomSamples.pdf and /dev/null differ diff --git a/inst/doc/AddingCustomSplitting.pdf b/inst/doc/AddingCustomSplitting.pdf deleted file mode 100644 index 6bc846f51..000000000 Binary files a/inst/doc/AddingCustomSplitting.pdf and /dev/null differ diff --git a/inst/doc/BuildingDeepLearningModels.pdf b/inst/doc/BuildingDeepLearningModels.pdf deleted file mode 100644 index a496daccb..000000000 Binary files a/inst/doc/BuildingDeepLearningModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingEnsembleModels.pdf b/inst/doc/BuildingEnsembleModels.pdf deleted file mode 100644 index 37e714ebf..000000000 Binary files a/inst/doc/BuildingEnsembleModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingMultiplePredictiveModels.pdf b/inst/doc/BuildingMultiplePredictiveModels.pdf deleted file mode 100644 index 31c3cb98a..000000000 Binary files a/inst/doc/BuildingMultiplePredictiveModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingPredictiveModels.pdf b/inst/doc/BuildingPredictiveModels.pdf deleted file mode 100644 index fd77a14bc..000000000 Binary files a/inst/doc/BuildingPredictiveModels.pdf and /dev/null differ diff --git a/inst/doc/CreatingLearningCurves.pdf b/inst/doc/CreatingLearningCurves.pdf deleted file mode 100644 index cfa6cf8ef..000000000 Binary files 
a/inst/doc/CreatingLearningCurves.pdf and /dev/null differ diff --git a/inst/doc/CreatingNetworkstudies.pdf b/inst/doc/CreatingNetworkstudies.pdf deleted file mode 100644 index 57a187a82..000000000 Binary files a/inst/doc/CreatingNetworkstudies.pdf and /dev/null differ diff --git a/inst/doc/CreatingShinyApp.pdf b/inst/doc/CreatingShinyApp.pdf deleted file mode 100644 index 6ad7fa645..000000000 Binary files a/inst/doc/CreatingShinyApp.pdf and /dev/null differ diff --git a/inst/doc/InstallationGuide.pdf b/inst/doc/InstallationGuide.pdf deleted file mode 100644 index 605054361..000000000 Binary files a/inst/doc/InstallationGuide.pdf and /dev/null differ diff --git a/inst/shinyConfig.json b/inst/shinyConfig.json index f2a674932..6034907d4 100644 --- a/inst/shinyConfig.json +++ b/inst/shinyConfig.json @@ -7,11 +7,8 @@ "shinyModulePackage": "OhdsiShinyModules", "uiFunction": "aboutViewer", "serverFunction": "aboutServer", - "databaseConnectionKeyService": null, - "databaseConnectionKeyUsername": null, "infoBoxFile": "aboutHelperFile()", "icon": "info", - "keyring": true, "order": 1 }, { @@ -19,14 +16,11 @@ "tabName": "Prediction", "tabText": "Prediction", "shinyModulePackage": "OhdsiShinyModules", - "uiFunction": "predictionViewer", - "serverFunction": "predictionServer", - "databaseConnectionKeyService": "resultDatabaseDetails", - "databaseConnectionKeyUsername": "prediction", - "infoBoxFile": "predictionHelperFile()", + "uiFunction": "patientLevelPredictionViewer", + "serverFunction": "patientLevelPredictionServer", + "infoBoxFile": "patientLevelPredictionHelperFile()", "icon": "chart-line", - "keyring": false, "order": 2 } ] -} +} \ No newline at end of file diff --git a/man/borutaSettings.Rd b/man/borutaSettings.Rd new file mode 100644 index 000000000..488a4f66a --- /dev/null +++ b/man/borutaSettings.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VariableSelection.R +\name{borutaSettings} +\alias{borutaSettings} +\title{Create the settings for Boruta feature selection} +\usage{ +borutaSettings( + nJobs = -1L, + maxDepth = 5L, + nTrees = "auto", + verbosity = 2L, + iterations = 100L, + randomState = 42L +) +} +\arguments{ +\item{nJobs}{How many jobs to do in parallel} + +\item{maxDepth}{Max depth of each tree in the RandomForest used} + +\item{nTrees}{How many trees to use, default is `auto`} + +\item{verbosity}{0 for silent, 1 to display iteration number, 2 to display features selected as well} + +\item{iterations}{How many iterations to run `Boruta` for. Default: 100} + +\item{randomState}{Either `NULL` or an integer. 
If integer it is the seed used by the random number generator} +} +\value{ +An object of class \code{featureEngineeringSettings} +} +\description{ +Create the settings for Boruta feature selection +} +\details{ +From: https://doi.org/10.18637/jss.v036.i11 +} diff --git a/man/createRandomForestFeatureSelection.Rd b/man/createRandomForestFeatureSelection.Rd index 02eb82377..ca7157573 100644 --- a/man/createRandomForestFeatureSelection.Rd +++ b/man/createRandomForestFeatureSelection.Rd @@ -7,9 +7,9 @@ createRandomForestFeatureSelection(ntrees = 2000, maxDepth = 17) } \arguments{ -\item{ntrees}{number of tree in forest} +\item{ntrees}{Number of tree in forest} -\item{maxDepth}{MAx depth of each tree} +\item{maxDepth}{Max depth of each tree} } \value{ An object of class \code{featureEngineeringSettings} diff --git a/man/createUnivariateFeatureSelection.Rd b/man/createUnivariateFeatureSelection.Rd deleted file mode 100644 index 3be0dcc83..000000000 --- a/man/createUnivariateFeatureSelection.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/FeatureEngineering.R -\name{createUnivariateFeatureSelection} -\alias{createUnivariateFeatureSelection} -\title{Create the settings for defining any feature selection that will be done} -\usage{ -createUnivariateFeatureSelection(k = 100) -} -\arguments{ -\item{k}{This function returns the K features most associated (univariately) to the outcome} -} -\value{ -An object of class \code{featureEngineeringSettings} -} -\description{ -Create the settings for defining any feature selection that will be done -} -\details{ -Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings -} diff --git a/man/doCyclopsCVPenalty.Rd b/man/doCyclopsCVPenalty.Rd new file mode 100644 index 000000000..35483ff9b --- /dev/null +++ b/man/doCyclopsCVPenalty.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CyclopsModels.R +\name{doCyclopsCVPenalty} +\alias{doCyclopsCVPenalty} +\title{do simple CV to determine best penalty manually} +\usage{ +doCyclopsCVPenalty( + data, + prior, + fixedCoefficients, + startingCoefficients, + penaltyRatio = 0.1, + nTries = 10 +) +} +\arguments{ +\item{data}{The training data} + +\item{prior}{Cyclops prior to use} + +\item{fixedCoefficients}{What coefficients (if any) should be fixed} + +\item{startingCoefficients}{What coefficients (if any) should have some starting value} + +\item{penaltyRatio}{This controls the lowest penalty to try as a ratio of `BIC` penalty. +Trying very low penalties will increase computation times a lot.} + +\item{nTries}{How many penalties to try. Default: 10} +} +\description{ +do simple CV to determine best penalty manually +} +\details{ +Will try a sequence of penalties from `BIC` down to `penaltyRatio` * `BIC`. How many penalties +to try is determined by `nTries.` Will use cross-validation to determine optimal penalty +based on `AUC`. 
+} diff --git a/man/njmimSettings.Rd b/man/njmimSettings.Rd index 920271039..8458936ed 100644 --- a/man/njmimSettings.Rd +++ b/man/njmimSettings.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/NJMIM.R +% Please edit documentation in R/VariableSelection.R \name{njmimSettings} \alias{njmimSettings} \title{Create settings using normalized joint mutual information maximization (NJMIM) for feature selection} @@ -7,7 +7,7 @@ njmimSettings(k = 20) } \arguments{ -\item{k}{number of features to select} +\item{k}{number of variables to select, default `20`} } \description{ Create settings using normalized joint mutual information maximization (NJMIM) for feature selection diff --git a/man/setIterativeHardThresholding.Rd b/man/setIterativeHardThresholding.Rd index 5d612f80b..315e6840c 100644 --- a/man/setIterativeHardThresholding.Rd +++ b/man/setIterativeHardThresholding.Rd @@ -15,13 +15,15 @@ setIterativeHardThresholding( tolerance = 1e-08, maxIterations = 10000, threshold = 1e-06, - delta = 0 + delta = 0, + nTries = 10 ) } \arguments{ \item{K}{The maximum number of non-zero predictors} -\item{penalty}{Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value} +\item{penalty}{Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value +If set to `auto` it will do CV to determine best penalty} \item{seed}{An option to add a seed when training the model} @@ -31,8 +33,8 @@ setIterativeHardThresholding( \item{fitBestSubset}{Logical: Fit final subset with no regularization} -\item{initialRidgeVariance}{integer or character vector. If set to auto will fit Ridge regression using -cross validation to determine best initialRidgeVariance value.} +\item{initialRidgeVariance}{integer or character vector. 
If set to `auto` will fit Ridge regression using +cross validation to determine best `initialRidgeVariance` value.} \item{tolerance}{numeric} @@ -41,6 +43,8 @@ cross validation to determine best initialRidgeVariance value.} \item{threshold}{numeric} \item{delta}{numeric} + +\item{nTries}{If `penalty` is `auto`, how many penalties to include in grid search} } \description{ Create setting for lasso logistic regression diff --git a/man/setStepwiseSelection.Rd b/man/setStepwiseSelection.Rd deleted file mode 100644 index db6cb3666..000000000 --- a/man/setStepwiseSelection.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/VariableSelection.R -\name{setStepwiseSelection} -\alias{setStepwiseSelection} -\title{Selects features using stepwise selection} -\usage{ -setStepwiseSelection( - modelSettings, - selectMethod = "backward", - nInitialVariables = 50, - nVariables = 20, - stepSize = 1 -) -} -\arguments{ -\item{modelSettings}{settings of model to use in fit after selecting variables} - -\item{selectMethod}{`backward` or `forward` selection} - -\item{nInitialVariables}{# of variables to select initially} - -\item{nVariables}{amount of variables to select, default `20`} - -\item{stepSize}{How many variables to add/remove in each step} -} -\description{ -Selects features using stepwise selection -} diff --git a/man/setUnivariateSelection.Rd b/man/setUnivariateSelection.Rd deleted file mode 100644 index 7dececa6f..000000000 --- a/man/setUnivariateSelection.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/VariableSelection.R -\name{setUnivariateSelection} -\alias{setUnivariateSelection} -\title{Selects features based on univariate statistics} -\usage{ -setUnivariateSelection( - modelSettings = PatientLevelPrediction::setLassoLogisticRegression(), - corMethod = "pearson", - nVariables = 50 -) -} -\arguments{ -\item{modelSettings}{settings of model to use in fit after selecting variables} - -\item{corMethod}{which type of correlation to use, `pearson`, `kendall` or `spearman`. 
default `pearson`} - -\item{nVariables}{amount of variables to select, default `50`} -} -\description{ -Selects features based on univariate statistics -} diff --git a/man/stepwiseSettings.Rd b/man/stepwiseSettings.Rd new file mode 100644 index 000000000..c6faaa3eb --- /dev/null +++ b/man/stepwiseSettings.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VariableSelection.R +\name{stepwiseSettings} +\alias{stepwiseSettings} +\title{Create settings using stepwise selection for feature selection} +\usage{ +stepwiseSettings( + k = 20, + selectMethod = "backward", + kStart = 100, + stepSize = 1, + modelSettings = PatientLevelPrediction::setLassoLogisticRegression() +) +} +\arguments{ +\item{k}{number of variables to select, default `20`} + +\item{selectMethod}{`backward` or `forward` selection} + +\item{kStart}{number of variables to select initially} + +\item{stepSize}{How many variables to add/remove in each step} + +\item{modelSettings}{settings of model to use in fit after selecting variables} +} +\description{ +Create settings using stepwise selection for feature selection +} diff --git a/man/univariateSettings.Rd b/man/univariateSettings.Rd new file mode 100644 index 000000000..d6bd1fd45 --- /dev/null +++ b/man/univariateSettings.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VariableSelection.R +\name{univariateSettings} +\alias{univariateSettings} +\title{Create settings using univariate statistics for feature selection} +\usage{ +univariateSettings(k = 20, corMethod = "pearson") +} +\arguments{ +\item{k}{number of variables to select, default `20`} + +\item{corMethod}{which type of correlation to use, `pearson`, `kendall` or `spearman`. default `pearson`} +} +\description{ +Create settings using univariate statistics for feature selection +} diff --git a/tests/testthat/test-cyclopsModels.R b/tests/testthat/test-cyclopsModels.R index 0bbf6d779..a0969f8e1 100644 --- a/tests/testthat/test-cyclopsModels.R +++ b/tests/testthat/test-cyclopsModels.R @@ -196,7 +196,7 @@ test_that("set IHT inputs", { expect_equal(model_set$param$priorParams$K, k) - penalty <- sample(c('bic', 'aic'),1) + penalty <- sample(c('bic', 'aic', 'auto'),1) model_set <- setIterativeHardThresholding(penalty = penalty) expect_equal(model_set$param$priorParams$penalty, penalty) diff --git a/tests/testthat/test-featureEngineering.R b/tests/testthat/test-featureEngineering.R index 04182d4d0..fbb9f1e36 100644 --- a/tests/testthat/test-featureEngineering.R +++ b/tests/testthat/test-featureEngineering.R @@ -40,14 +40,14 @@ test_that("createFeatureEngineeringSettings correct class", { testUniFun <- function(k = 100){ - result <- createUnivariateFeatureSelection(k = k) + result <- univariateSettings(k = k) return(result) } -test_that("createUnivariateFeatureSelection correct class", { +test_that("univariateSettings correct class", { k <- sample(1000,1) featureEngineeringSettings <- testUniFun(k = k) @@ -72,7 +72,7 @@ test_that("univariateFeatureSelection", { reducedTrainData <- univariateFeatureSelection( trainData = newTrainData, featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = NULL + covariateIdsSelected = NULL ) newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() @@ -144,7 +144,8 @@ test_that("randomForestFeatureSelection", { reducedTrainData <- randomForestFeatureSelection( trainData = newTrainData, featureEngineeringSettings = 
featureEngineeringSettings, - covariateIdsInclude = NULL + covariateIdsSelected = NULL + ) newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() @@ -159,7 +160,7 @@ test_that("featureSelection is applied on test_data", { newTrainData <- univariateFeatureSelection( trainData = newTrainData, featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = NULL + covariateIdsSelected = NULL ) modelSettings <- setLassoLogisticRegression() @@ -176,7 +177,7 @@ test_that("featureSelection is applied on test_data", { } }) -test_that("njmim settings function works", { +test_that("NJMIM settings function works", { k <- sample(1000,1) featureEngineeringSettings <- njmimSettings(k = k) @@ -200,7 +201,7 @@ test_that("NJMIM feature selection works", { reducedTrainData <- njmimFeatureSelection( trainData = newTrainData, featureEngineeringSettings = featureEngineeringSettings, - covariateIdsInclude = NULL + covariateIdsSelected = NULL ) newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() diff --git a/tests/testthat/test-featureImportance.R b/tests/testthat/test-featureImportance.R index 193f43b41..c6cb387c1 100644 --- a/tests/testthat/test-featureImportance.R +++ b/tests/testthat/test-featureImportance.R @@ -19,7 +19,7 @@ context("FeatureImportance") # Test unit for the creation of the study externalValidatePlp -test_that("pfi feature importance returns data.frame", { +test_that("feature importance returns data.frame", { # limit to a sample of 10 covariates for faster test covariates <- plpResult$model$covariateImportance %>% @@ -42,12 +42,12 @@ test_that("pfi feature importance returns data.frame", { }) -test_that('pfi feature importance works with logger or without covariates', { +test_that('feature importance works with logger or without covariates', { tinyResults <- runPlp(plpData = tinyPlpData, populationSettings = populationSettings, outcomeId = 2, analysisId = 'tinyFit', - featureEngineeringSettings = createUnivariateFeatureSelection(k=20), + featureEngineeringSettings = univariateSettings(k=20), executeSettings = createExecuteSettings( runSplitData = T, runSampleData = F,
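Editorial aside (not part of the diff): this PR consolidates the feature-selection constructors in R/VariableSelection.R and renames them to plain settings functions (univariateSettings, stepwiseSettings, njmimSettings, plus the new borutaSettings), each returning a featureEngineeringSettings object that runPlp() applies to the training data before model fitting, as the test above does with univariateSettings(k = 20). A minimal sketch under those assumptions; plpData, populationSettings and the ids below are hypothetical placeholders.

# Hedged usage sketch of the renamed constructors with runPlp()
featureEngineeringSettings <- univariateSettings(
  k = 20,                 # number of covariates to keep
  corMethod = "pearson"   # univariate correlation with the outcome
)
# Alternatives introduced or relocated by this PR:
# stepwiseSettings(k = 20, selectMethod = "backward", kStart = 100, stepSize = 1)
# njmimSettings(k = 20)                               # needs the 'praznik' package
# borutaSettings(maxDepth = 5L, iterations = 100L)    # needs python 'boruta' and 'sklearn'

results <- runPlp(
  plpData = plpData,                        # hypothetical plpData object
  outcomeId = 2,
  analysisId = "featureSelectionExample",
  populationSettings = populationSettings,  # hypothetical population settings
  featureEngineeringSettings = featureEngineeringSettings,
  modelSettings = setLassoLogisticRegression(),
  saveDirectory = file.path(tempdir(), "plp")
)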