diff --git a/NAMESPACE b/NAMESPACE index fc2aacc..eaf957a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(candidateNumberExplore) export(modelsCurveExplore) export(predictExplore) export(rocCurveExplore) diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index c657748..2d99ba7 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -78,8 +78,13 @@ saveData <- function(output_path, train_data, file_name) { # Fix col type for binary data binary_cols <- sapply(1:ncol(train_data), function(c) all(train_data[[c]] %in% 0:1)) - train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], labels=c(0,1))) + + # Convert TRUE/FALSE to 1/0 + train_data <- convert_logical(train_data) + # Order data (first binary then continuous features) + train_data <- cbind(train_data[binary_cols],train_data[!binary_cols]) # Order needed for correct functioning of main algorithm in C++ + # Save data as arff file if (file.exists(paste0(output_path, file_name, ".arff"))) {file.remove(paste0(output_path, file_name, ".arff"))} farff::writeARFF(train_data, paste0(output_path, file_name, ".arff")) @@ -97,9 +102,43 @@ saveData <- function(output_path, train_data, file_name) { # TODO: Support other file formats? } +convert_logical <- function(train_data) { + + binary_cols <- sapply(train_data, function(col) all(col %in% c(0, 1, TRUE, FALSE))) + + # Convert TRUE/FALSE to 1/0 and create factors + train_data[binary_cols] <- lapply(train_data[binary_cols], function(col) { + col <- as.numeric(as.logical(col)) # Convert TRUE/FALSE to 1/0 + factor(col, levels = c(0, 1), labels = c(0, 1)) # Convert to factors + }) + + return(train_data) + +} + # Correlation metric for binary data. jaccard <- function(a, b) { intersection = length(intersect(a, b)) union = length(a) + length(b) - intersection return (intersection/union) -} \ No newline at end of file +} + +phi <- function(a, b) { + contingency_tb <- table(a, b) + + r.sum <- rowSums(contingency_tb) + c.sum <- colSums(contingency_tb) + + total <- sum(r.sum) + r.sum <- r.sum/total + c.sum <- c.sum/total + + v <- prod(r.sum, c.sum) + phi <- (contingency_tb[1,1] / total - c.sum[1] * r.sum[1] / sqrt(v)) + names(phi) <- NULL + + return(phi) +} + + + diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 2e704be..17ff331 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -39,17 +39,17 @@ trainExplore <- function(train_data = NULL, StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, PrintSettings = TRUE, - PrintPerformance = TRUE, - Subsumption = TRUE, + PrintPerformance = FALSE, + Subsumption = FALSE, BranchBound = TRUE, Parallel = FALSE, PrintCutoffSets = TRUE, @@ -101,6 +101,7 @@ trainExplore <- function(train_data = NULL, checkDouble(Accuracy), checkDouble(BalancedAccuracy), checkDouble(Specificity), + checkString(OutputMethod), checkLogical(PrintSettings), checkLogical(PrintPerformance), checkLogical(Subsumption), @@ -121,6 +122,7 @@ trainExplore <- function(train_data = NULL, Subsumption <- ifelse(Subsumption == TRUE, "yes", "no") BranchBound <- ifelse(BranchBound == TRUE, "yes", "no") Parallel <- ifelse(Parallel == TRUE, "yes", "no") + BinaryReduction <- ifelse(BinaryReduction == TRUE, "yes", "no") Accuracy <- ifelse(Accuracy == 0, "", Accuracy) BalancedAccuracy <- ifelse(BalancedAccuracy == 0, "", BalancedAccuracy) Specificity <- ifelse(Specificity == 0, "", Specificity) @@ -155,7 +157,9 @@ trainExplore <- function(train_data = NULL, cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) cor(col, train_data[ClassFeature_]==PositiveClass_, method=Sorted)) } else if (Sorted == "jaccard") { cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) jaccard(col, train_data[ClassFeature_]==PositiveClass_)) - } + } else if (Sorted == "phi") { + cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) phi(col, train_data[ClassFeature_]==PositiveClass_)) + } # else if (Sorted == "LASSO") { # model_lasso <- glmnet::cv.glmnet(x=data.matrix(train_data[, -which(names(train_data) == ClassFeature_)]), y = train_data[ClassFeature_]==PositiveClass_, alpha = 1, lambda = 10^seq(3, -2, by = -.1), maxit=10000000, standardize = TRUE, nfolds = 5, family = "binomial") # coef <- as.matrix(coef(model_lasso, s = "lambda.min")) # get importance @@ -164,7 +168,7 @@ trainExplore <- function(train_data = NULL, # } coef <- names(cor)[order(-abs(cor))] - train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by LASSO importance + train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by importance } saveData(output_path, train_data, file_name) @@ -212,17 +216,18 @@ trainExplore <- function(train_data = NULL, # "cutoff_sets" = cutoff_sets) # Load model - rule_string <- stringr::str_extract(results, "Best candidate \\(overall\\):.*?\u000A") + rule_string <- stringr::str_extract_all(results, "Best candidate:.*?\u000A") + rule_string <- unlist(rule_string)[[length(rule_string)]] # Select the last rule as this is the final candidate # Clean string - rule_string <- stringr::str_replace(rule_string, "Best candidate \\(overall\\):", "") + rule_string <- stringr::str_replace(rule_string, "Best candidate:", "") rule_string <- stringr::str_replace_all(rule_string, " ", "") rule_string <- stringr::str_replace_all(rule_string, "\\n", "") results <- list("model" = rule_string, - "candidate_models" = candidate_models, - "cutoff_sets" = cutoff_sets) + "candidate_models" = candidate_models, + "cutoff_sets" = cutoff_sets) result <- results[resultType] @@ -249,6 +254,7 @@ trainExplore <- function(train_data = NULL, #' @param Maximize One of list with strings, list = "ACCURACY", ... #' @param Accuracy Float 0-1 -> default = 0 (if 0, make empty = computationally more beneficial) #' @param Specificity float 0-1, default = 0 +#' @param OutputMethod string EVERY, BEST, INCREMENT #' @param PrintSettings True or False #' @param PrintPerformance True or False #' @param Subsumption True or False @@ -269,17 +275,18 @@ settingsExplore <- function(settings, ClassFeature, PositiveClass, FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = "yes", - PrintPerformance = "yes", - PrintCutoffSets = "yes", - Subsumption = "yes", + PrintPerformance = "no", + PrintCutoffSets = "no", + Subsumption = "no", BranchBound = "yes", Parallel = "no", - OutputMethod = "EVERY", + ParallelMethod = "TWO", BinaryReduction = "no") { @@ -308,6 +315,7 @@ settingsExplore <- function(settings, settings <- changeSetting(settings, parameter = "Subsumption", input = Subsumption) settings <- changeSetting(settings, parameter = "BranchBound", input = BranchBound) settings <- changeSetting(settings, parameter = "Parallel", input = Parallel) + settings <- changeSetting(settings, parameter = "ParallelMethod", input = ParallelMethod) settings <- changeSetting(settings, parameter = "OutputMethod", input = OutputMethod) settings <- changeSetting(settings, parameter = "BinaryReduction", input = BinaryReduction) @@ -339,6 +347,11 @@ predictExplore <- function(model, test_data) { return(NULL) } + # Clean string + model <- stringr::str_remove_all(model, '\"') + model <- stringr::str_replace_all(model, "=", "==") + model <- stringr::str_replace_all(model, "<=", "<") # to correct initial case <= -> <== -> <= + # Split string all_terms <- stringr::str_split_fixed(model, "OR", n=Inf) @@ -360,12 +373,59 @@ predictExplore <- function(model, test_data) { data_model <- cbind(data_model, as.integer(col==length(all_literals))) } - colnames(data_model) <- all_terms + colnames(data_model) <- all_terms # TODO: CHECK HERE WHY DATA_MODEL NO COLUMNS predictions <- as.integer(rowSums(data_model)>0) return(predictions) } +#' Return a set of results from EXPLORE output file +#' @param outputFile outputfile = paste0(output_path, file_name, ".result") +#' +#' @export +resultsExplore <- function(outputFile) { + + # Read in results file + results <- paste(readLines(outputFile), collapse="\n") + results_lines <- strsplit(results, "\n") %>% unlist() + + result <- list() + + for (line in results_lines) { + # line <- "Candidate model: '198124209' = \"0\"" + if (grepl(":", line)) { + if (grepl("Candidate model", line)) { + split_line <- strsplit(line, ":")[[1]] + key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .) + value <- trimws(split_line[2]) + result[[key]] <- c(result[[key]], value) + } else { + split_line <- strsplit(line, ":")[[1]] + key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .) + value <- trimws(split_line[2]) + result[[key]] <- value + } + } + } + + return(result) +} + +#' Return the number of candidate rules for EXPLORE +#' @param OutputFile output file = paste0(output_path, file_name, ".result") +#' +#' @export +candidateNumberExplore <- function(OutputFile) { + + # Read in results file + results <- paste(readLines(OutputFile), collapse="\n") + + num_candidates <- stringr::str_extract_all(results, "Total Count Candidates \\(incl constraints\\):.*?\u000A")[[1]] + num_candidates <- as.data.frame(stringr::str_remove_all(num_candidates, "Total Count Candidates \\(incl constraints\\):")) + num_candidates <- stringr::str_replace_all(num_candidates, "\\n", "") + + return(as.numeric(num_candidates)) +} #' modelsCurveExplore # TODO: update documentation? #' @@ -386,19 +446,23 @@ modelsCurveExplore <- function(train_data = NULL, StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = TRUE, - PrintPerformance = TRUE, - Subsumption = TRUE, + PrintPerformance = FALSE, + Subsumption = FALSE, BranchBound = TRUE, - Parallel = FALSE) { + Sorted = "none", + Parallel = TRUE, + ParallelMethod = "TWO", + BinaryReduction = FALSE) { # TODO: only input required variables? # Range of specificities to check @@ -418,9 +482,9 @@ modelsCurveExplore <- function(train_data = NULL, ClassFeature = ClassFeature, PositiveClass = PositiveClass, FeatureInclude = FeatureInclude, Maximize = "SENSITIVITY", Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, Specificity = constraint, - PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, + OutputMethod = OutputMethod, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, Subsumption = Subsumption, BranchBound = BranchBound, - Parallel = Parallel) + Parallel = Parallel, ParallelMethod = ParallelMethod) return(model) }) diff --git a/R/testExplore.R b/R/testExplore.R index 872d7e3..054892b 100644 --- a/R/testExplore.R +++ b/R/testExplore.R @@ -1,4 +1,7 @@ -testExplore <- function(dataset = "iris", StartRulelength = 2, EndRulelength = 2, BinaryReduction = FALSE) { +testExplore <- function(dataset = "iris", + StartRulelength = 2, + EndRulelength = 2, + BinaryReduction = FALSE) { # dataset = "iris" # dataset = "binary_3" # dataset = "binary_10" diff --git a/inst/CMakeLists.txt b/inst/CMakeLists.txt index 16bbc7e..344a7a3 100644 --- a/inst/CMakeLists.txt +++ b/inst/CMakeLists.txt @@ -38,9 +38,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ltbb") # set(CMAKE_PREFIX_PATH "/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8") # find_library(TBB_LIB tbb) # find_path(TBB_PATH - # HINTS /opt/intel/oneapi/tbb/latest/include - # NAMES tbb/parallel_for.h) - +# HINTS /opt/intel/oneapi/tbb/latest/include +# NAMES tbb/parallel_for.h) + set(SOURCE_FILES Clion/main.cpp ../src/C++/CMExplore/cmdline.h @@ -93,8 +93,7 @@ set(SOURCE_FILES ../src/C++/IOExplore/IOExplore.h ../src/C++/common.cpp ../src/C++/common.h - ../src/C++/stl.h - ../src/C++/stlpmt.lib) + ../src/C++/stl.h) add_executable(Explore ${SOURCE_FILES}) diff --git a/inst/examples/complexity/binary_10.project b/inst/examples/complexity/binary_10.project index 6119d60..551c342 100644 --- a/inst/examples/complexity/binary_10.project +++ b/inst/examples/complexity/binary_10.project @@ -8,7 +8,7 @@ IncrementalOutputFile=false [Setup] PartitionMethod=RESUBSTITUTION Randomize=no -StartRulelength=3 +StartRulelength=1 EndRulelength=3 LearnRatio=0.8 NumberofPartitions=1 @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes +PrintCombinations=no +PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/binary_3.project b/inst/examples/complexity/binary_3.project index 4719a91..65d2bfd 100644 --- a/inst/examples/complexity/binary_3.project +++ b/inst/examples/complexity/binary_3.project @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes +PrintCombinations=no +PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/categorical_4.project b/inst/examples/complexity/categorical_4.project index 725de6b..b4416cf 100644 --- a/inst/examples/complexity/categorical_4.project +++ b/inst/examples/complexity/categorical_4.project @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes -PrintCutoffSets=yes -PrintCutOffsetsBestLength=no +PrintCombinations=no +PrintFeatureSets=no +PrintCutoffSets=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/continuous_4.project b/inst/examples/complexity/continuous_4.project index d6b6618..b70e721 100644 --- a/inst/examples/complexity/continuous_4.project +++ b/inst/examples/complexity/continuous_4.project @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes -PrintCutoffSets=yes -PrintCutOffsetsBestLength=no +PrintCombinations=no +PrintFeatureSets=no +PrintCutoffSets=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/mix_4.project b/inst/examples/complexity/mix_4.project index 15cde8a..3ec382c 100644 --- a/inst/examples/complexity/mix_4.project +++ b/inst/examples/complexity/mix_4.project @@ -25,8 +25,9 @@ FeatureRule= Maximize=BALANCEDACCURACY Accuracy= Specificity= +BalancedAccuracy=0.6268 [Output] -OutputMethod=BEST +OutputMethod=EVERY PrintSettings=yes PrintPartitions=no PrintFeatureOperators=no @@ -34,17 +35,17 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes +PrintCombinations=no +PrintFeatureSets=no PrintCutoffSets=no PrintCutOffsetsBestLength=no -PrintPerformance=yes +PrintPerformance=no PrintSets=no SavePartitions=no [Run] Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/plp/test_plp.project b/inst/examples/plp/test_plp.project index bb3a586..cb7a0a6 100644 --- a/inst/examples/plp/test_plp.project +++ b/inst/examples/plp/test_plp.project @@ -38,7 +38,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/examples/test.project b/inst/examples/test.project index 03f0fbb..94832ed 100755 --- a/inst/examples/test.project +++ b/inst/examples/test.project @@ -39,7 +39,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/examples/tests/iris.project b/inst/examples/tests/iris.project index 105e39e..e69d898 100644 --- a/inst/examples/tests/iris.project +++ b/inst/examples/tests/iris.project @@ -37,8 +37,7 @@ PrintOperatorMethod=no PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no -PrintCutoffSets=yes -PrintCutOffsetsBestLength=no +PrintCutoffSets=no PrintPerformance=yes PrintSets=yes SavePartitions=no diff --git a/inst/examples/train_data.project b/inst/examples/train_data.project index e1ce687..1d1f2b4 100644 --- a/inst/examples/train_data.project +++ b/inst/examples/train_data.project @@ -37,7 +37,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/settings/template.project b/inst/settings/template.project index 7945586..fa612fe 100755 --- a/inst/settings/template.project +++ b/inst/settings/template.project @@ -13,7 +13,7 @@ EndRulelength=3 LearnRatio=0.8 NumberofPartitions=1 OperatorMethod=EXHAUSTIVE -CutoffMethod=RVAC +CutoffMethod=ALL CutoffFile=@CutoffFile ClassFeature=@ClassFeature PositiveClass=@PositiveClass @@ -22,7 +22,7 @@ Rule= FeatureInclude= FeatureRule= [Constraints] -Maximize=ACCURACY +Maximize=BALANCEDACCURACY Accuracy= BalancedAccuracy= Specificity= @@ -38,13 +38,12 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no -PrintPerformance=yes +PrintPerformance=no PrintSets=no SavePartitions=no [Run] -Subsumption=yes +Subsumption=no BranchBound=yes -Parallel=no -ParallelMethod=ONE +Parallel=yes +ParallelMethod=TWO BinaryReduction=no diff --git a/man/candidateNumberExplore.Rd b/man/candidateNumberExplore.Rd new file mode 100644 index 0000000..e9ab07e --- /dev/null +++ b/man/candidateNumberExplore.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MainFunctions.R +\name{candidateNumberExplore} +\alias{candidateNumberExplore} +\title{Return the number of candidate rules for EXPLORE} +\usage{ +candidateNumberExplore(OutputFile) +} +\arguments{ +\item{OutputFile}{output file = paste0(output_path, file_name, ".result")} +} +\description{ +Return the number of candidate rules for EXPLORE +} diff --git a/man/modelsCurveExplore.Rd b/man/modelsCurveExplore.Rd index 0f0b79c..767ec77 100644 --- a/man/modelsCurveExplore.Rd +++ b/man/modelsCurveExplore.Rd @@ -13,19 +13,23 @@ modelsCurveExplore( StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = TRUE, - PrintPerformance = TRUE, - Subsumption = TRUE, + PrintPerformance = FALSE, + Subsumption = FALSE, BranchBound = TRUE, - Parallel = FALSE + Sorted = "none", + Parallel = TRUE, + ParallelMethod = "TWO", + BinaryReduction = FALSE ) } \arguments{ diff --git a/man/settingsExplore.Rd b/man/settingsExplore.Rd index 12bd109..453c30d 100644 --- a/man/settingsExplore.Rd +++ b/man/settingsExplore.Rd @@ -17,17 +17,18 @@ settingsExplore( ClassFeature, PositiveClass, FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = "yes", - PrintPerformance = "yes", - PrintCutoffSets = "yes", - Subsumption = "yes", + PrintPerformance = "no", + PrintCutoffSets = "no", + Subsumption = "no", BranchBound = "yes", Parallel = "no", - OutputMethod = "EVERY", + ParallelMethod = "TWO", BinaryReduction = "no" ) } @@ -62,6 +63,8 @@ settingsExplore( \item{Specificity}{float 0-1, default = 0} +\item{OutputMethod}{string EVERY, BEST, INCREMENT} + \item{PrintSettings}{True or False} \item{PrintPerformance}{True or False} diff --git a/man/trainExplore.Rd b/man/trainExplore.Rd index 80e4e87..7ef2d3f 100644 --- a/man/trainExplore.Rd +++ b/man/trainExplore.Rd @@ -13,23 +13,24 @@ trainExplore( StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, PrintSettings = TRUE, - PrintPerformance = TRUE, - Subsumption = TRUE, + PrintPerformance = FALSE, + Subsumption = FALSE, BranchBound = TRUE, Parallel = FALSE, PrintCutoffSets = TRUE, Sorted = "none", OutputMethod = "EVERY", - BinaryReduction = BinaryReduction + BinaryReduction = FALSE, + resultType = c("model", "candidate_models", "cutoff_sets") ) } \arguments{ diff --git a/src/C++/Explore/candidate.cpp b/src/C++/Explore/candidate.cpp index 987cd7f..e79a71e 100755 --- a/src/C++/Explore/candidate.cpp +++ b/src/C++/Explore/candidate.cpp @@ -40,6 +40,8 @@ void CANDIDATE::Clear() { Performance.Accuracy.Value = 0; Performance.NPV.Value = 0; Performance.PPV.Value = 0; + Performance.BalancedAccuracy.Value = 0; + Performance.F1score.Value = 0; } /********************************************************************** diff --git a/src/C++/Explore/condition.cpp b/src/C++/Explore/condition.cpp index 267cb00..5137b6e 100755 --- a/src/C++/Explore/condition.cpp +++ b/src/C++/Explore/condition.cpp @@ -21,7 +21,9 @@ CONDITION::CONDITION() { // Needed for cutoffsets IsSolo = false; + IsRepeated = false; NonSoloIncluded = false; + RepeatedFeature = false; NextSame = false; PreviousSame = false; @@ -49,7 +51,9 @@ CONDITION::CONDITION(unsigned int CNumber, string CName, vector CCutoffs // Needed for cutoffsets IsSolo = false; + IsRepeated = false; NonSoloIncluded = false; + RepeatedFeature = false; NextSame = false; PreviousSame = false; diff --git a/src/C++/Explore/condition.h b/src/C++/Explore/condition.h index abe68af..fd29c13 100755 --- a/src/C++/Explore/condition.h +++ b/src/C++/Explore/condition.h @@ -42,7 +42,11 @@ class CONDITION { // FeatureOperator bool IsSolo; // FeatureOperator occurs within rule on it's own bool NonSoloIncluded; // FeatureOperator included in a conjunction with size>1 - + + // Feature + bool IsRepeated; + bool RepeatedFeature; + // Condition bool PreviousSame; // Feature is equal to previous feature within conjunction (left one) bool NextSame; // Feature is equal to next feature (right one) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index 22bd7e5..10e370a 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -13,6 +13,7 @@ #include #include #include +#include std::mutex m0; std::mutex m1; @@ -232,50 +233,40 @@ void Explore::ValidateBestCandidate() { Start = clock(); #endif - if (Initialised) { // TODO: adjust this to train instead of split validation/learn + if (Initialised) { if (!Final) { Rule.SetTestMode(VALIDATION); } else { - Rule.SetTestMode(LEARN); // TODO: don't need new type train, remove? + Rule.SetTestMode(LEARN); // HERE: VALIDATE is also part of LEARN } - // cout << endl << "BEST RULES (" << PartitionCandidates.size() << " candidates)" << endl << endl; - cout << endl << "BEST RULES" << endl << endl; - - if(PartitionCandidates.IsValid()) { + if (PartitionCandidates.IsValid()) { CANDIDATE BestCandidate = PartitionCandidates; - // for (unsigned int i=GetMinRuleLength(); i<=GetMaxRuleLength(); i++){ - cout << "RULELENGTH " << BestCandidate.Size() << endl << endl; - // if (ChooseBestCandidate(i)){ + cout << endl; + cout << "Best length: " << BestCandidate.Size() << endl; if (Rule.SetRule(BestCandidate)) { - cout << "Best candidate (overall): "; + cout << "Best candidate: "; Rule.PrintCutoffSet(); - cout << endl; - cout << "Learn-set: "; + cout << "Performance learn-set: "; BestCandidate.Performance.Print(); - cout << endl; - if (!(GetPartitionMethod()==RESUBSTITUTION)){ + if (!(GetPartitionMethod()==RESUBSTITUTION) && !Final){ BestCandidate.Performance = Rule.CalculatePerformance(); // Test BestCandidate on validation partition - cout << "Validation-set: "; + cout << "Performance validation-set: "; BestCandidate.Performance.Print(); cout << endl; } ProjectCandidates.push_back(BestCandidate); - } -// } else { -// cout << "None." << endl << endl; -// } - // } - PartitionCandidates.Clear(); } else { #if defined(EXPLORE_MPI_DEBUG) cout << "--> No Candidates" << endl; #endif } + + PartitionCandidates.Clear(); } #ifdef DEBUG_TIMING @@ -406,12 +397,12 @@ void Explore::SummarisePerformance() { case ACCURACY: CurrentValue = (*CurrentCandidate).Performance.Accuracy.Value; break; - case BALANCEDACCURACY: - CurrentValue = (*CurrentCandidate).Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CurrentValue = (*CurrentCandidate).Performance.F1score.Value; - break; + case BALANCEDACCURACY: + CurrentValue = (*CurrentCandidate).Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CurrentValue = (*CurrentCandidate).Performance.F1score.Value; + break; } // Calculate minimum performance if (CurrentValue0) { cout << Name << endl; } else { @@ -809,7 +800,7 @@ void Explore::PrintPerformance() { if (Initialised) { CurrentPerformance.Print(); } - + #ifdef DEBUG_TIMING End = clock(); ExploreTiming.AddTime("EXPLORE::PrintPerformance", Start, End); @@ -846,12 +837,12 @@ void Explore::PrintConstraints() { case ACCURACY: cout << "Accuracy"; break; - case BALANCEDACCURACY: - cout << "Balanced Accuracy"; - break; - case F1SCORE: - cout << "F1 score"; - break; + case BALANCEDACCURACY: + cout << "Balanced Accuracy"; + break; + case F1SCORE: + cout << "F1 score"; + break; } cout << endl; @@ -872,12 +863,12 @@ void Explore::PrintConstraints() { case ACCURACY: cout << "Accuracy >= "; break; - case BALANCEDACCURACY: - cout << "Balanced Accuracy >= "; - break; - case F1SCORE: - cout << "F1 score >= "; - break; + case BALANCEDACCURACY: + cout << "Balanced Accuracy >= "; + break; + case F1SCORE: + cout << "F1 score >= "; + break; } cout << (*CurrentConstraint).Value << endl; } @@ -1157,7 +1148,7 @@ Out: - Description: Partition the population of explore. **********************************************************************/ bool Explore::Partition() { - // ValidateBestCandidate(); // Do not remove! Is needed for summarising best candidates at the end of projects (ie. HOLDOUT) + ValidateBestCandidate(); // Do not remove! Is needed for summarising best candidates at the end of projects (ie. HOLDOUT) if (Population.Partition()) { // Will return false with holdout on second call! SetRerun(); // Reset rule (findcutoffs etc) @@ -1227,10 +1218,8 @@ bool Explore::Initialise() { // Print project settings if (IsPrintSettings) { - PrintSettings(); - } - if (IsPrintPerformance) { - PrintConstraints(); + PrintSettings(); + PrintConstraints(); } if (IsPrintFeatureOperators) { // Print FeatureOperators @@ -1242,7 +1231,7 @@ bool Explore::Initialise() { End = clock(); ExploreTiming.AddTime("EXPLORE::Initialise", Start, End); #endif - + return Initialised; } @@ -1277,7 +1266,7 @@ bool Explore::LimitedInitialise() { NoPartitionsDone = 0; RulesProcessed = 0; - FeatureSetsProcessed = 0; + FeatureSetsProcessed = 0; //PR ExploreComplexity = RuleComplexity(); // Calculate complexity for progress indication ExploreComplexity = Population.GetNoPartitions()*10; InitialiseCPFP(); @@ -1292,7 +1281,7 @@ bool Explore::LimitedInitialise() { End = clock(); ExploreTiming.AddTime("EXPLORE::Initialise", Start, End); #endif - + return Initialised; } @@ -1416,7 +1405,7 @@ Function: GetSeed() Category: Selectors Scope: public In: - -Out: long double, the seed which is a cast from time_t +Out: long double, the seed which is a cast from time_t Description: Get the seed used to randomize the population. **********************************************************************/ long double Explore::GetSeed() { @@ -1717,7 +1706,7 @@ bool Explore::RemoveCutoff(unsigned int FeatureNumber, string CutoffValue) { Population.RemoveCutoff(FeatureNumber, CutoffValue); // Remove the cutoff return true; } - + return false; } @@ -1743,7 +1732,7 @@ bool Explore::RemoveCutoffRange(unsigned int FeatureNumber) { Function: RemoveFeatureCutoffs() Category: Modifiers Scope: public -In: +In: Out: bool, could/could not remove cutoffs for selected feature Description: Remove all cutoffs belonging to a specific feature. **********************************************************************/ @@ -1752,7 +1741,7 @@ bool Explore::RemoveFeatureCutoffs(unsigned int FeatureNumber) { Population.RemoveFeatureCutoffs(FeatureNumber); return true; } - + return false; } @@ -1921,7 +1910,7 @@ Function: GetOperatorMethod() Category: Selectors Scope: public In: - -Out: OPERATOR_METHOD, the operator method +Out: OPERATOR_METHOD, the operator method Description: Returns the method to determine operators of each feature. **********************************************************************/ OPERATOR_METHOD Explore::GetOperatorMethod() { @@ -2060,19 +2049,6 @@ bool Explore::GetPrintCutoffSets() { return IsPrintCutoffSets; } -/********************************************************************** -Function: GetPrintCutoffSets() -Category: Selectors -Scope: public -In: - -Out: bool -Description: -**********************************************************************/ -bool Explore::GetPrintCutoffSetsBestLength() { - return IsPrintCutoffSetsBestLength; -} - - /********************************************************************** Function: GetPrintPerformance() Category: Selectors @@ -2489,19 +2465,6 @@ void Explore::SetPrintCutoffSets(bool Setting) { Rule.SetPrintCutoffSets(Setting); } -/********************************************************************** -Function: SetPrintconditionSets() -Category: Modifiers -Scope: public -In: bool, yes or no -Out: - -Description: Indicate whether explore has to cout conditionsets. -**********************************************************************/ -void Explore::SetPrintCutoffSetsBestLength(bool Setting) { - IsPrintCutoffSetsBestLength = Setting; - Rule.SetPrintCutoffSetsBestLength(Setting); -} - /********************************************************************** Function: SetPrintPerformance() Category: Modifiers @@ -2654,7 +2617,7 @@ void Explore::Start() { cout << endl << "TIMING" << endl << endl; time(&endtime); cout << "Project end: " << ctime(&endtime) << endl; - + std::stringstream sstr; sstr << "RuleLength:" << Rule.GetMaxRuleLength(); RuleLengthTiming.Clear(); @@ -2910,14 +2873,18 @@ bool Explore::RunProject() { unsigned int Partitionnr = 0; time_t dummy; unsigned int ActiveRuleLength; - int CountCandidatesPartition; - int CountFeatureOperatorPairs; - int CountCutoffSets; + + unsigned int CountCandidatesPartition; + unsigned int CountFeatureOperatorPairs; + unsigned int CountCutoffSets; + + CANDIDATE BestCandidate; int BestLengthPartition; int BestLengthFinal = 0; vector BestLength(Rule.GetMaxRuleLength()); - int CPBest_global = 0; - int CTBest_global = 0; + + int CPBest_global; + int CTBest_global; TIMING TermTupleTiming; clock_t StartTimeTermTuple; @@ -2929,7 +2896,6 @@ bool Explore::RunProject() { #endif do { - // CountCombinations = 0; CountFeatureOperatorPairs = 0; CountCutoffSets = 0; CountCandidatesPartition = 0; @@ -2948,263 +2914,255 @@ bool Explore::RunProject() { if (IsPrintCutoffMethod) Population.PrintCutoffMethod(); if (IsPrintCutoffValues) Population.PrintCutoffs(); -if (!Parallel) { - float CandidatePerformance; - int FOSets_old = 0; - - while (Rule.NextCombinationGenerator()) { - // cout << "FO pairs: " << CountFeatureOperatorPairs - FOSets_old << endl; - // FOSets_old = CountFeatureOperatorPairs; - - cout << "Cutoff sets: " << CountCutoffSets - FOSets_old << endl; - FOSets_old = CountCutoffSets; - - if (IsPrintCombinations) Rule.PrintCombination(); - - StartTimeTermTuple = clock(); - - while (Rule.NextFeatureSetGenerator(0, Rule.GetFeatureOperatorSize())) { - - - if (IsPrintFeatureSets) Rule.PrintFeatureSet(); - CountFeatureOperatorPairs++; - - // CalculateProgress(); - while (Rule.NextCutoffSetGenerator()) { - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - - if (Rule.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - - PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); - } - - // TODO: check if inside or outside TestRule - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule.PrintCutoffSet(); - } - - CountCutoffSets++; - // if (IsUpdateRealtime) CalculateProgress(); + if (!Parallel) { + float CandidatePerformance = 0; + int count = 0; + + while (Rule.NextCombinationGenerator()) { + // cout << "FO pairs: " << CountFeatureOperatorPairs - count << endl; + // count = CountFeatureOperatorPairs; + + // cout << "Cutoff sets: " << CountCutoffSets - count << endl; + // count = CountCutoffSets; + + if (IsPrintCombinations) Rule.PrintCombination(); + + StartTimeTermTuple = clock(); + + while (Rule.NextFeatureSetGenerator(0, Rule.GetFeatureOperatorSize())) { + + if (IsPrintFeatureSets) Rule.PrintFeatureSet(); + CountFeatureOperatorPairs++; + + while (Rule.NextCutoffSetGenerator()) { + + switch (MaximizeMeasure) { + case SENSITIVITY: + CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } + + if (Rule.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, RestrictionSet, + RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { + + PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); + } + + if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set + cout << "Candidate model: "; + Rule.PrintCutoffSet(); + } + + CountCutoffSets++; + // if (IsUpdateRealtime) CalculateProgress(); #ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter - - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; - } + // BreatheCount++;// Increment breathe counter + + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } #endif - } + } + } + // } + + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); + CountCandidatesPartition += Rule.GetCountCandidates(); + Rule.ResetCountCandidates(); } -// std::stringstream sstr; -// TermTupleTiming.Clear(); -// TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); -// Rule.PrintCombination(); -// cout << TermTupleTiming.PrintTotal(); -// cout << "Candidates: " << Rule.GetCountCandidates() << endl << endl; - CountCandidatesPartition += Rule.GetCountCandidates(); - Rule.ResetCountCandidates(); - } + } else { -} else { + CPBest_global = 0; + CTBest_global = 0; - if (ParallelMethod == ONE) { + if (ParallelMethod == ONE) { - vector all_rules; - while(Rule.NextCombinationGenerator()) { - all_rules.push_back(this->Rule); - } + vector all_rules; + while (Rule.NextCombinationGenerator()) { + all_rules.push_back(this->Rule); + } - tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { - for (int i = r.begin(); i < r.end(); ++i) { - StartTimeTermTuple = clock(); - - RULE Rule_i = RULE(all_rules[i]); // CREATE DEEP COPY - - float CandidatePerformance; - CANDIDATE PotentialCandidate; - - m2.lock(); - - Rule_i.CPBest = CPBest_global; - Rule_i.CTBest = CTBest_global; - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - m2.unlock(); - - if (IsPrintCombinations) Rule_i.PrintCombination(); - - // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_i, tbb::this_task_arena::current_thread_index()); - - while (Rule_i.NextFeatureSetGenerator(0, Rule_i.GetFeatureOperatorSize())) { - if (IsPrintFeatureSets) Rule_i.PrintFeatureSet_Thread(); - // CalculateProgress(); - m0.lock(); - CountFeatureOperatorPairs++; - m0.unlock(); - - while (Rule_i.NextCutoffSetGenerator()) { - - if (Rule_i.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - - PotentialCandidate = Rule_i.SaveCandidate(MaximizeMeasure, - RestrictionSet); - m2.lock(); - - bool change; - switch (MaximizeMeasure) { - case SENSITIVITY: - change = (PotentialCandidate.Performance.Sensitivity.Value > PartitionCandidates.Performance.Sensitivity.Value); - break; - case SPECIFICITY: - change = (PotentialCandidate.Performance.Specificity.Value > PartitionCandidates.Performance.Specificity.Value); - break; - case NPV: - change = (PotentialCandidate.Performance.NPV.Value > PartitionCandidates.Performance.NPV.Value); - break; - case PPV: - change = (PotentialCandidate.Performance.PPV.Value > PartitionCandidates.Performance.PPV.Value); - break; - case ACCURACY: - change = (PotentialCandidate.Performance.Accuracy.Value > PartitionCandidates.Performance.Accuracy.Value); - break; - case BALANCEDACCURACY: - change = (PotentialCandidate.Performance.BalancedAccuracy.Value > PartitionCandidates.Performance.BalancedAccuracy.Value); - break; - case F1SCORE: - change = (PotentialCandidate.Performance.F1score.Value > PartitionCandidates.Performance.F1score.Value); - break; - } + tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { + for (int i = r.begin(); i < r.end(); ++i) { + StartTimeTermTuple = clock(); + + RULE Rule_i = RULE(all_rules[i]); // CREATE DEEP COPY + + float CandidatePerformance; + CANDIDATE PotentialCandidate; + + m2.lock(); + + Rule_i.CPBest = CPBest_global; + Rule_i.CTBest = CTBest_global; + + switch (MaximizeMeasure) { + case SENSITIVITY: + CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } + m2.unlock(); + + if (IsPrintCombinations) Rule_i.PrintCombination(); + + // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_i, tbb::this_task_arena::current_thread_index()); + + while (Rule_i.NextFeatureSetGenerator(0, Rule_i.GetFeatureOperatorSize())) { + if (IsPrintFeatureSets) Rule_i.PrintFeatureSet_Thread(); + + m0.lock(); + CountFeatureOperatorPairs++; + m0.unlock(); + + while (Rule_i.NextCutoffSetGenerator()) { + + if (Rule_i.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, RestrictionSet, + RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { + + PotentialCandidate = Rule_i.SaveCandidate(MaximizeMeasure, + RestrictionSet); + m2.lock(); + + bool change; + switch (MaximizeMeasure) { + case SENSITIVITY: + change = (PotentialCandidate.Performance.Sensitivity.Value > + PartitionCandidates.Performance.Sensitivity.Value); + break; + case SPECIFICITY: + change = (PotentialCandidate.Performance.Specificity.Value > + PartitionCandidates.Performance.Specificity.Value); + break; + case NPV: + change = (PotentialCandidate.Performance.NPV.Value > + PartitionCandidates.Performance.NPV.Value); + break; + case PPV: + change = (PotentialCandidate.Performance.PPV.Value > + PartitionCandidates.Performance.PPV.Value); + break; + case ACCURACY: + change = (PotentialCandidate.Performance.Accuracy.Value > + PartitionCandidates.Performance.Accuracy.Value); + break; + case BALANCEDACCURACY: + change = (PotentialCandidate.Performance.BalancedAccuracy.Value > + PartitionCandidates.Performance.BalancedAccuracy.Value); + break; + case F1SCORE: + change = (PotentialCandidate.Performance.F1score.Value > + PartitionCandidates.Performance.F1score.Value); + break; + } - if (change) { - PartitionCandidates = PotentialCandidate; + if (change) { + PartitionCandidates = PotentialCandidate; - CPBest_global = Rule_i.CPBest; - CTBest_global = Rule_i.CTBest; + CPBest_global = Rule_i.CPBest; + CTBest_global = Rule_i.CTBest; + } + m2.unlock(); } - m2.unlock(); - } - - // TODO: check if inside or outside TestRule - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule_i.PrintCutoffSet(); - } - m1.lock(); - CountCutoffSets++; - m1.unlock(); - // if (IsUpdateRealtime) CalculateProgress(); -#ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter - - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; + if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set + cout << "Candidate model: "; + Rule_i.PrintCutoffSet(); } -#endif + m1.lock(); + CountCutoffSets++; + m1.unlock(); + // if (IsUpdateRealtime) CalculateProgress(); + + #ifndef COMMANDVERSION + // BreatheCount++;// Increment breathe counter + + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } + #endif + } } + + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); + m3.lock(); + CountCandidatesPartition += Rule_i.GetCountCandidates(); + m3.unlock(); } + }); + } else if (ParallelMethod == TWO) { -// std::stringstream sstr; -// TermTupleTiming.Clear(); -// TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); -// Rule_i.PrintCombination(); -// cout << TermTupleTiming.PrintTotal(); -// cout << "Candidates: " << Rule_i.GetCountCandidates() << endl << endl; - m3.lock(); - CountCandidatesPartition += Rule_i.GetCountCandidates(); - m3.unlock(); + vector all_rules; + while (Rule.NextCombinationGenerator()) { + all_rules.push_back(this->Rule); } - // } - }); - } else if (ParallelMethod == TWO) { - - vector all_rules; - while(Rule.NextCombinationGenerator()) { - all_rules.push_back(this->Rule); - } - tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { - - // for (int i = r.begin(); i < r.end(); ++i) - int i = r.begin(); + tbb::parallel_for(tbb::blocked_range2d(0, Rule.GetCombinationsGenerated(), 0, Rule.GetFeatureOperatorSize()),[all_rules, &CPBest_global, &CTBest_global, this, &CountFeatureOperatorPairs, &CountCutoffSets, &CountCandidatesPartition] (const tbb::blocked_range2d &r) { - StartTimeTermTuple = clock(); - - // NOTE: blocked_range uses open interval [start,end) - tbb::parallel_for(tbb::blocked_range(0, Rule.GetFeatureOperatorSize() + 1), [&](tbb::blocked_range s) { + for (int i = r.rows().begin(); i < r.rows().end(); ++i) { - // for (int j = s.begin(); j < s.end(); j++) - { - int j = s.begin(); + for (int j = r.cols().begin(); j < r.cols().end(); j++) { RULE Rule_ij = RULE(all_rules[i]); // CREATE DEEP COPY @@ -3247,46 +3205,58 @@ if (!Parallel) { while (Rule_ij.NextFeatureSetGenerator(j, j)) { // TODO: create function that releases all print statements of one thread at once - + if (IsPrintFeatureSets) Rule_ij.PrintFeatureSet_Thread(); m0.lock(); CountFeatureOperatorPairs++; m0.unlock(); - // CalculateProgress(); while (Rule_ij.NextCutoffSetGenerator()) { if (Rule_ij.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { + CandidatePerformance, MaximizeMeasure, + RestrictionSet, + RuleOutputMethod, IsPrintPerformance, + IsPrintSets)) { - PotentialCandidate = Rule_ij.SaveCandidate(MaximizeMeasure, RestrictionSet); + PotentialCandidate = Rule_ij.SaveCandidate(MaximizeMeasure, + RestrictionSet); m2.lock(); bool change; switch (MaximizeMeasure) { case SENSITIVITY: - change = (PotentialCandidate.Performance.Sensitivity.Value > PartitionCandidates.Performance.Sensitivity.Value); + change = ( + PotentialCandidate.Performance.Sensitivity.Value > + PartitionCandidates.Performance.Sensitivity.Value); break; case SPECIFICITY: - change = (PotentialCandidate.Performance.Specificity.Value > PartitionCandidates.Performance.Specificity.Value); + change = ( + PotentialCandidate.Performance.Specificity.Value > + PartitionCandidates.Performance.Specificity.Value); break; case NPV: - change = (PotentialCandidate.Performance.NPV.Value > PartitionCandidates.Performance.NPV.Value); + change = (PotentialCandidate.Performance.NPV.Value > + PartitionCandidates.Performance.NPV.Value); break; case PPV: - change = (PotentialCandidate.Performance.PPV.Value > PartitionCandidates.Performance.PPV.Value); + change = (PotentialCandidate.Performance.PPV.Value > + PartitionCandidates.Performance.PPV.Value); break; case ACCURACY: - change = (PotentialCandidate.Performance.Accuracy.Value > PartitionCandidates.Performance.Accuracy.Value); + change = (PotentialCandidate.Performance.Accuracy.Value > + PartitionCandidates.Performance.Accuracy.Value); break; case BALANCEDACCURACY: - change = (PotentialCandidate.Performance.BalancedAccuracy.Value > PartitionCandidates.Performance.BalancedAccuracy.Value); + change = ( + PotentialCandidate.Performance.BalancedAccuracy.Value > + PartitionCandidates.Performance.BalancedAccuracy.Value); break; case F1SCORE: - change = (PotentialCandidate.Performance.F1score.Value > PartitionCandidates.Performance.F1score.Value); + change = (PotentialCandidate.Performance.F1score.Value > + PartitionCandidates.Performance.F1score.Value); break; } @@ -3299,7 +3269,6 @@ if (!Parallel) { m2.unlock(); } - // TODO: check if inside or outside TestRule if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set cout << "Candidate model: "; Rule_ij.PrintCutoffSet(); @@ -3312,51 +3281,54 @@ if (!Parallel) { #ifndef COMMANDVERSION // BreatheCount++;// Increment breathe counter - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; - } + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } #endif } } -// std::stringstream sstr; -// TermTupleTiming.Clear(); -// TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); -// Rule_ij.PrintCombination(); -// cout << TermTupleTiming.PrintTotal(); -// cout << "Candidates: " << Rule_ij.GetCountCandidates() << endl << endl; - m3.lock(); - CountCandidatesPartition += Rule_ij.GetCountCandidates(); - m3.unlock(); - } - }); - } - }); + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); + m3.lock(); + CountCandidatesPartition += Rule_ij.GetCountCandidates(); + m3.unlock(); + } + } + + }); + } } -} -// TODO: is "Rule" needed? - BestLengthPartition = PartitionCandidates.Size(); // TODO: for multiple projectcandidates? + if (PartitionCandidates.IsValid()) { + BestCandidate = PartitionCandidates; + BestLengthPartition = BestCandidate.Size(); + +// cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; +// cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; +// cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition with restrictions (mandatory features) without constraints (accuracy/sensitivity) +// cout << "Total Count Candidates (incl constraints):" << CountCandidatesPartition << endl; +// cout << endl; - cout << endl << endl; - cout << "Best Length:" << BestLengthPartition << endl; - cout << "====================================================" << endl; + Rule.SetRule(BestCandidate); // Needed for parallel? - if (BestLengthPartition != 0) { - // BestLengthPartition = Rule.FindBestLength(Initialised,PartitionCandidates, PartitionMethod, MaximizeMeasure); - // BestLength[BestLengthPartition - 1] = BestLength[BestLengthPartition - 1] + 1; // Calculate performance of current rule in validation set + // Update counter of best length BestLength.at(BestLengthPartition - 1) = BestLength.at(BestLengthPartition - 1) + 1; + + } else { + BestLengthPartition = 0; } } while (Partition()); @@ -3364,62 +3336,43 @@ if (!Parallel) { auto MostFrequent = std::max_element(BestLength.begin(), BestLength.end()); BestLengthFinal = std::distance(std::begin(BestLength), MostFrequent) + 1; - cout << "Results EXPLORE with BestLength " << BestLengthFinal << " on full train set" << endl; + cout << "====================================================" << endl; + cout << endl; + + cout << "RESULT: full train set" << endl; + cout << endl; + if ((GetPartitionMethod())==CROSS_VALIDATION || (GetPartitionMethod())==HOLDOUT) { // Re-train model with full train set (learn + validate) Population.ResetTestPartitions(); // Sets all partitions to LEARN - // PartitionCandidates.clear(); // Remove all the partition candidates used to find BestLength - PartitionCandidates.Clear(); + PartitionCandidates.Clear(); // Remove all the partition candidates used to find BestLength SetRerun(); - Induce(BestLengthFinal, BestLengthFinal); + Induce(BestLengthFinal, BestLengthFinal); // TODO: need to support running in parallel Final = true; ValidateBestCandidate(); // Print results on full train set and save best rule } else { - -#ifndef PARALLEL // Directly print results on full train set and save best rule - if (PartitionCandidates.IsValid()) { - // if (PartitionCandidates.size()>0) { - CANDIDATE BestCandidate = PartitionCandidates; - // CANDIDATE BestCandidate = Rule.ChooseBestCandidate(BestLengthFinal, Initialised, PartitionCandidates, MaximizeMeasure); - // if (ChooseBestCandidate(BestLengthFinal)){ - if (Rule.SetRule(BestCandidate)) { - cout << "Best candidate (overall): "; - Rule.PrintCutoffSet(); - cout << endl; - cout << "Learn-set: "; - BestCandidate.Performance.Print(); - cout << endl; - - ProjectCandidates.push_back(BestCandidate); - - cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; - cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; - cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition - // cout << "Total Count Candidates:" << CountCandidatesPartition << endl; - - } - // } - } -# else - Population.ResetTestPartitions(); // Sets all partitions to LEARN - PartitionCandidates.clear(); // Remove all the partition candidates used to find BestLength - - SetRerun(); - - Induce(BestLengthFinal, BestLengthFinal); - - Final = true; - ValidateBestCandidate(); // Print results on full train set and save best rule - -# endif - + cout << "Best length: " << BestLengthPartition << endl; + if (Rule.SetRule(BestCandidate)) { + cout << "Best candidate: "; + Rule.PrintCutoffSet(); + cout << "Performance learn-set: "; + BestCandidate.Performance.Print(); + + ProjectCandidates.push_back(BestCandidate); + } } + cout << endl; + cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; + cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; + cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition with restrictions (mandatory features) without constraints (accuracy/sensitivity) + cout << "Total Count Candidates (incl constraints):" << CountCandidatesPartition << endl; + return true; } @@ -3477,12 +3430,6 @@ void Explore::Induce(int nStart, int nEnd) { PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); } - - if (IsPrintCutoffSetsBestLength) { - cout << "Candidate model BestLength: "; - Rule.PrintCutoffSet(); - } - #ifndef COMMANDVERSION // BreatheCount++; // Increment breathe counter if (BreatheCount>BREATHE_INTERVAL) { @@ -3502,7 +3449,6 @@ void Explore::Induce(int nStart, int nEnd) { } } - cout << "Total Count Candidates:" << Rule.GetCountCandidates() << endl; } diff --git a/src/C++/Explore/explore.h b/src/C++/Explore/explore.h index e4ec8e7..183bd59 100755 --- a/src/C++/Explore/explore.h +++ b/src/C++/Explore/explore.h @@ -107,7 +107,6 @@ class Explore { bool IsPrintCombinations; // Print combinations to output bool IsPrintFeatureSets; // Print featuresets to output bool IsPrintCutoffSets; // Print conditionsets to output - bool IsPrintCutoffSetsBestLength; bool IsPrintPerformance; // Print performance to output bool IsPrintSets; // Print sets to output bool IsPrintOperatorMethod; // Print operator-method information to output @@ -211,7 +210,6 @@ class Explore { bool GetPrintCombinations(); // Should combinations be printed to output bool GetPrintFeatureSets(); // Should featuresets be printed to output bool GetPrintCutoffSets(); // Should conditionsets be printed to output - bool GetPrintCutoffSetsBestLength(); // Should conditionsets be printed to output bool GetPrintPerformance(); // Should performance be printed to output bool GetPrintSets(); // Should sets be printed to output bool GetPrintOperatorMethod(); // Should operator-method information be printed to output @@ -258,7 +256,6 @@ class Explore { void SetPrintCombinations(bool Setting); // Print combinations to output void SetPrintFeatureSets(bool Setting); // Print featuresets to output void SetPrintCutoffSets(bool Setting); // Print conditionsets to output - void SetPrintCutoffSetsBestLength(bool Setting); // Print conditionsets to output void SetPrintPerformance(bool Setting); // Print performance to output void SetPrintSets(bool Setting); // Print sets to output void SetPrintOperatorMethod(bool Setting); // Print operator-method information to output diff --git a/src/C++/Explore/feature.h b/src/C++/Explore/feature.h index 30376c4..944ae8c 100755 --- a/src/C++/Explore/feature.h +++ b/src/C++/Explore/feature.h @@ -18,7 +18,7 @@ class FEATURE { private: vector Observations; // Vector of observations enabling direct access - vector Cutoffs; // Vector of cutoffs enabling direct access + // Vector of cutoffs enabling direct access vector LearnClasses; // List of pointers to class objects of observations ordered on value for learning vector ValidationClasses; // List of pointers to class objects of observations ordered on value for validation @@ -160,6 +160,7 @@ class FEATURE { string PrintCutoffMethod(); // Print cutoff method information string PrintOperatorMethod(); // Print operator method information + vector Cutoffs; }; #endif diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 606af18..c459957 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -641,14 +641,14 @@ Out: unsigned int, the minimum order of a cutoff Description: Returns the order of the minimal cutoff for a specific FeatureOperator currently used in the rule. **********************************************************************/ -unsigned int RULE::GetMinCutoff(unsigned int FOperator) { +unsigned int RULE::GetMinCutoff(unsigned int Fnum, int ConjunctionNr) { CONDITION* CurrentCondition; - unsigned int Result = FeatureOperators[FOperator].Cutoffs.size(); + unsigned int Result = Features[0][Fnum].Cutoffs.size(); - for (unsigned int i=0; i1; j++) { + for (unsigned int i=0; iFeatureOperator==FOperator) { + if (CurrentCondition->FeatureNumber==Fnum && (Conjunctions[i].Size>1 || FeatureOperators[CurrentCondition->FeatureOperator].RepeatedFeature)) { if (CurrentCondition->CutoffNumberCutoffNumber; } @@ -688,14 +688,14 @@ Out: unsigned int, the maximum order of a cutoff Description: Returns the order of the maximum cutoff for a specific FeatureOperator currently used in the rule. **********************************************************************/ -unsigned int RULE::GetMaxCutoff(unsigned int FOperator) { +unsigned int RULE::GetMaxCutoff(unsigned int Fnum) { CONDITION* CurrentCondition; unsigned int Result = 0; for (unsigned int i=0; i1; j++) { CurrentCondition = &Conjunctions[i].Conditions[j]; - if (CurrentCondition->FeatureOperator==FOperator) { + if (CurrentCondition->FeatureNumber==Fnum && (Conjunctions[i].Size>1 || FeatureOperators[CurrentCondition->FeatureOperator].RepeatedFeature)) { if (CurrentCondition->CutoffNumber>Result) { Result = CurrentCondition->CutoffNumber; } @@ -1670,9 +1670,6 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { Start = clock(); #endif - // int FOperatorNr_start = 1; - // int FOperatorNr_end = FOperatorNr_start+1; - // Counters as reference int ConjunctionSize, ConjunctionNr, ConditionNr, FOperatorNr, MaxFOperator; CONDITION* Condition; @@ -1728,7 +1725,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { for (unsigned int j=0; j0) { if (BinaryReduction && Conjunctions[ConjunctionNr].Size==1) { // Simply go to next FeatureOperator, no repeats + + if (FOperatorNr_start > 0 && Conjunctions[ConjunctionNr-1].Size!=1) { // For Parallel = TWO when starting with higher FOperatorNr + FOperatorNr=0; + NumRepeats=0; + } else if (FeatureOperators[Conjunctions[ConjunctionNr-1].Conditions[Conjunctions[ConjunctionNr-1].Size-1].FeatureOperator].Operator!=EQUAL + && Conjunctions[ConjunctionNr-1].Size!=1) { // Unless previous feature is continuous and not also term size 1, then repeat so "go back one" + FOperatorNr--; + } + } else if (Conjunctions[ConjunctionNr-1].Size>1) { FOperatorNr=0; - NumRepeats = 0; + NumRepeats=0; } else {//allow multiple occurences of nominal features if (FeatureOperators[Conjunctions[ConjunctionNr - 1].Conditions[0].FeatureOperator].Operator == EQUAL && !BinaryReduction) { @@ -1865,7 +1871,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { Condition = &Conjunctions[ConjunctionNr].Conditions[ConditionNr]; // Save reference to condition - if (Conjunctions[ConjunctionNr].Size>1 ) { + if (Conjunctions[ConjunctionNr].Size>1) { PreviousCondition = &FeatureOperators[Conjunctions[ConjunctionNr-1].Conditions[ConditionNr].FeatureOperator]; // AM: copy previous term NumRepeats = 0; } else { @@ -1877,7 +1883,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { for (i=0; i<=ConjunctionNr-1; i++) { // Go through all previous conjunctions (front of rule) PreviousConjunction = &Conjunctions[i]; for (unsigned int j = 0; j < PreviousConjunction->Conditions.size(); j++) { - if (FONext == PreviousConjunction->Conditions[j].FeatureOperator) { + if (FONext == PreviousConjunction->Conditions[j].FeatureOperator && FeatureOperators[ PreviousConjunction->Conditions[j].FeatureOperator].Operator==EQUAL) { FONext++; i=0; j=0; @@ -1919,7 +1925,6 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { } - // Not the conjunction at which we started, conjunction sizes do not match if (ConjunctionNr!=StartConjunctionNr && Conjunctions[ConjunctionNr].Size!=Conjunctions[ConjunctionNr-1].Size) { @@ -1933,7 +1938,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { for (unsigned int j=0; jCutoffs.size(); if (CurrentConjunction->Size==1 && Conjunctions.size()>1) { // More than one conjunction and current conjunction size = 1 - if (CurrentCondition-> Operator==EQUAL){ + if (CurrentCondition->Operator==EQUAL){ if (CurrentCondition->CutoffNumber+1 < MaxCutoff) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2120,7 +2125,7 @@ bool RULE::NextCutoffSet() { ConditionNr--; } } else if (CurrentCondition->Operator==LESS) { - MaxCutoff = GetMinCutoff(CurrentCondition->FeatureOperator); + MaxCutoff = GetMinCutoff(CurrentCondition->FeatureNumber, ConjunctionNr); if (CurrentCondition->CutoffNumber+1 < MaxCutoff) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2128,7 +2133,7 @@ bool RULE::NextCutoffSet() { ConditionNr--; } } else if (CurrentCondition-> Operator==GREATER){ - MaxCutoff = GetMaxCutoff(CurrentCondition->FeatureOperator); + MaxCutoff = GetMaxCutoff(CurrentCondition->FeatureNumber); if (CurrentCondition->CutoffNumber+1 > MaxCutoff && CurrentCondition->CutoffNumber+1Cutoffs.size()) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2137,10 +2142,12 @@ bool RULE::NextCutoffSet() { } } } else { - if (CurrentFeatureOperator->IsSolo && CurrentFeatureOperator->NonSoloIncluded && CurrentConjunction->Size>1) { // && !(CurrentFeatureOperator->Operator == LESS) - if (MaxCutoff == 2 || CurrentFeatureOperator->Operator==GREATER) { // MaxCutoff == 2 && Operator==EQUAL? - MaxCutoff--; // Needed for binary, should be removed for categorical - } + if (CurrentFeatureOperator->Operator==EQUAL && MaxCutoff==2){ // Needed for binary,should be removed for categorical + if (CurrentFeatureOperator->NonSoloIncluded) {MaxCutoff--;} + } else if (CurrentFeatureOperator->Operator==GREATER) { + if (CurrentFeatureOperator->NonSoloIncluded) {MaxCutoff--;} + } else if (CurrentFeatureOperator->Operator==LESS){ + if (CurrentFeatureOperator->RepeatedFeature) {MaxCutoff--;} } if (CurrentCondition->NextSame) { // For greater, also for equal or less? MaxCutoff--; @@ -2219,12 +2226,20 @@ bool RULE::NextCutoffSet() { } else if (CurrentCondition->Operator==LESS) { CurrentCondition->CutoffNumber = 0; } else if (CurrentCondition->Operator==GREATER) { + CurrentCondition->CutoffNumber = 0; + // Reset to next cutoff - CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureOperator)+1; + if (CurrentFeatureOperator->NonSoloIncluded || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->IsRepeated){ + CurrentCondition->CutoffNumber = GetMaxCutoff(CurrentCondition->FeatureNumber)+1; - // Or first if maximum reached - if (CurrentCondition->CutoffNumber > CurrentCondition->Cutoffs.size()-1) { - CurrentCondition->CutoffNumber = 0; + // Or first if maximum reached + if (CurrentCondition->CutoffNumber > CurrentCondition->Cutoffs.size()-1) { + if (!CurrentFeatureOperator->RepeatedFeature){ + CurrentCondition->CutoffNumber = 0; + } else { + CurrentCondition->CutoffNumber = 1; + } + } } } } else { @@ -2247,8 +2262,8 @@ bool RULE::NextCutoffSet() { } // Reset to next cutoff - if (!(CurrentCondition->Operator==GREATER) && CurrentFeatureOperator->IsSolo && CurrentConjunction->Size>1) { - CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureOperator)+1; + if ((CurrentFeatureOperator->NonSoloIncluded && !(CurrentCondition->Operator==GREATER)) || (CurrentFeatureOperator->RepeatedFeature && !(CurrentCondition->Operator==LESS))) { + CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber, (int)Conjunctions.size())+1; // TODO: check if correct } } } @@ -2271,7 +2286,9 @@ bool RULE::NextCutoffSet() { vector::iterator LFOperator(FeatureOperators.end()); for (; CFOperator != LFOperator; CFOperator++) { CFOperator->IsSolo = false; + CFOperator->IsRepeated = false; CFOperator->NonSoloIncluded = false; + CFOperator->RepeatedFeature = false; } // Reset cutoffs of Conditions in rule and find equal features within conjunctions @@ -2315,6 +2332,24 @@ bool RULE::NextCutoffSet() { for (ConjunctionNr = Conjunctions.size()-1; ConjunctionNr>=0; ConjunctionNr--) { if (Conjunctions[ConjunctionNr].Size==1) { FeatureOperators[Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator].IsSolo=true; + + // Identify occurences of that feature in other terms (of size 1 or more than 1) + for (int C=ConjunctionNr-1; C>=0; C--) { + CurrentConjunction = &Conjunctions[C]; + for (ConditionNr=0; ConditionNr<(int)CurrentConjunction->Size; ConditionNr++) { // Iterate through conditions + if (CurrentConjunction->Conditions[ConditionNr].FeatureNumber == + Conjunctions[ConjunctionNr].Conditions[0].FeatureNumber) { + FeatureOperators[Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator].IsRepeated=true; + + if (CurrentConjunction->Size > 1 && CurrentConjunction->Conditions[ConditionNr].FeatureOperator == + Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator) { + FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].NonSoloIncluded = true; + } else { + FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; + } + } + } + } } } @@ -2324,13 +2359,11 @@ bool RULE::NextCutoffSet() { CurrentCondition = &Conjunctions[ConjunctionNr].Conditions[ConditionNr]; CurrentFeatureOperator = &FeatureOperators[CurrentCondition->FeatureOperator]; - if (CurrentFeatureOperator->IsSolo==true) { // Is current condition solo? + if (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->NonSoloIncluded) { // Is current condition solo? if (CurrentConjunction->Size>1) { - CurrentFeatureOperator->NonSoloIncluded = true; - - if (!(CurrentCondition->Operator==GREATER)) { // If operator = less or equal, start from next cutoff + if ((CurrentFeatureOperator->NonSoloIncluded && !(CurrentCondition->Operator==GREATER)) || (CurrentFeatureOperator->RepeatedFeature && !(CurrentCondition->Operator==LESS))) { if (CurrentCondition->Cutoffs.size()>1) { - CurrentCondition->CutoffNumber = 1; + CurrentCondition->CutoffNumber = 1; // Then start from next cutoff } else { CutoffSetGenerated = false; return false; @@ -2340,9 +2373,10 @@ bool RULE::NextCutoffSet() { } else { CurrentCondition->CutoffNumber = 0; // TODO: unneccesary? - if (CurrentFeatureOperator->NonSoloIncluded && (CurrentCondition->Operator == GREATER)) { + if ((CurrentFeatureOperator->NonSoloIncluded || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->IsRepeated) && CurrentCondition->Operator == GREATER) { if (CurrentCondition->Cutoffs.size()>1) { - CurrentCondition-> CutoffNumber = 1; + CurrentCondition->CutoffNumber = GetMaxCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct, or should it be MinCutoff? + // CurrentCondition->CutoffNumber = 1; } else { CutoffSetGenerated = false; return false; @@ -2760,18 +2794,6 @@ void RULE::SetPrintCutoffSets(bool Setting) { IsPrintCutoffSets = Setting; } -/********************************************************************** -Function: SetPrintCutoffSets() -Category: Modifiers -Scope: public -In: bool, yes or no -Out: - -Description: Rule must cout conditionsets that it generates. -**********************************************************************/ -void RULE::SetPrintCutoffSetsBestLength(bool Setting) { - IsPrintCutoffSetsBestLength = Setting; -} - /********************************************************************** Function: SetTestMode() @@ -2794,9 +2816,6 @@ void RULE::SetTestMode(PARTITION_TYPE PType) { case VALIDATION: PartitionClasses = (*Features)[FeatureOperators[j].FeatureNumber].GetValidationClasses(); break; - case TRAIN: // both learn and validation set - PartitionClasses = (*Features)[FeatureOperators[j].FeatureNumber].GetTrainClasses(); - break; } FeatureOperators[j].InitialiseSets(PartitionClasses); } @@ -3427,11 +3446,9 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand CountCandidates++; if (CompareBestCandidate(CurrentPerformance, Initialised, CandidatePerformance, MaximizeMeasure)) { - // PartitionCandidates = SaveCandidate(CurrentPerformance, PartitionCandidates, MaximizeMeasure, RestrictionSet); Found = true; } } else { - // PartitionCandidates = SaveCandidate(CurrentPerformance, PartitionCandidates, MaximizeMeasure, RestrictionSet); Candidate = true; CountCandidates++; @@ -3442,7 +3459,10 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand switch (RuleOutputMethod) { case EVERY: - PrintCutoffSet(); + if (Candidate) { + cout << "Candidate model: "; + PrintCutoffSet(); + } if (IsPrintPerformance) { PrintPerformance(); } @@ -3467,8 +3487,7 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand End = clock(); ExploreTiming.AddTime("EXPLORE::TestRule", Start, End); #endif - // TODO: indicate when partition candidates NOT updated or return Candidate instead? - // return Candidate; + return Found; } @@ -3510,6 +3529,12 @@ bool RULE::CompareConstraints(PERFORMANCE CurrentPerformance, bool Initialised, break; case ACCURACY: RuleValue = CurrentPerformance.Accuracy.Value; + break; + case BALANCEDACCURACY: + RuleValue = CurrentPerformance.BalancedAccuracy.Value; + break; + case F1SCORE: + RuleValue = CurrentPerformance.F1score.Value; } ConstraintValue = (*CurrentConstraint).Value; if (ConstraintValue > RuleValue) { @@ -3579,7 +3604,7 @@ bool RULE::CompareBestCandidate(PERFORMANCE CurrentPerformance, bool Initialised break; } - if (CandidateValue<=RuleValue) { // TODO: why = included here? + if (CandidateValueRuleValue) { + if (CandidateValue>=RuleValue) { #ifdef DEBUG_TIMING End = clock(); ExploreTiming.AddTime("EXPLORE::CompareBestCandidate", Start, End); @@ -3649,167 +3674,6 @@ CANDIDATE RULE::SaveCandidate(PERFORMANCE_MEASURE MaximizeMeasure, bool Restrict #endif } -/********************************************************************** -Function: BestLength() -Category: Modifiers -Scope: public -In: - -Out: - -Description: is stop criterium met? -**********************************************************************/ -int RULE::FindBestLength(bool Initialised, CANDIDATE PartitionCandidates, PARTITION_METHOD PartitionMethod,PERFORMANCE_MEASURE MaximizeMeasure) { - float best; - float current; - int Opt=0; - - CANDIDATE BestCandidate; - - if (Initialised) { - SetTestMode(VALIDATION); - - if (PartitionCandidates.IsValid()) { - // if (PartitionCandidates.size()>0) { - for (unsigned int i=GetMinRuleLength(); i<=GetMaxRuleLength(); i++){ - BestCandidate = ChooseBestCandidate(i, Initialised, PartitionCandidates, MaximizeMeasure); - - if (BestCandidate.Performance.Accuracy.Value != 0) { // Check if BestCandidate not empty - if (SetRule(BestCandidate)) - { - cout << "RULELENGTH " << i << endl << endl; - cout << "Best candidate (within this partition): "; - PrintCutoffSet(); - cout << endl; - cout << "Learn-set: "; - BestCandidate.Performance.Print(); - cout << endl; - - if (!(PartitionMethod==RESUBSTITUTION)){ - BestCandidate.Performance = CalculatePerformance(); // Test BestCandidate on validation partition - cout << "Validation-set: "; - BestCandidate.Performance.Print(); - cout << endl; - } - switch (MaximizeMeasure){ - - case ACCURACY: - current = BestCandidate.Performance.Accuracy.Value; - break; - case SENSITIVITY: - current = BestCandidate.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - current = BestCandidate.Performance.Specificity.Value; - break; - case NPV: - current = BestCandidate.Performance.NPV.Value; - break; - case PPV: - current = BestCandidate.Performance.PPV.Value; - break; - case BALANCEDACCURACY: - current = BestCandidate.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - current = BestCandidate.Performance.F1score.Value; - break; - } - if (i==1) { - best = current; - Opt = 1; - } - else { - if (current > best) { - best = current; - Opt = i; - } - } - } - } - } - return Opt; - } else { -#if defined(EXPLORE_MPI_DEBUG) - cout << "--> No Candidates" << endl; -#endif - } - } - return 0; -} - - - -/********************************************************************** -Function: ChooseBestCandidate() -Category: Modifiers -Scope: public -In: insigned int, rule length -Out: - -Description: Retrieves the best candidate and puts it in -BestCandidate. -**********************************************************************/ -CANDIDATE RULE::ChooseBestCandidate(unsigned int RuleLength, bool Initialised, CANDIDATE PartitionCandidates, PERFORMANCE_MEASURE MaximizeMeasure) { -#ifdef DEBUG_TIMING - clock_t Start, End; - Start = clock(); -#endif -// bool Found = false; -// CANDIDATE BestCandidate; -// -// if (Initialised) { -// tbb::concurrent_vector::iterator CurrentCandidate(PartitionCandidates.begin()); -// tbb::concurrent_vector::iterator LastCandidate(PartitionCandidates.end()); -// -// // TODO: check if better place to create variable -// BestCandidate = (*CurrentCandidate); -// -// float CurrentValue; -// float BestValue; -// -// while (CurrentCandidate != LastCandidate) { -// CurrentValue = 0; -// if ((*CurrentCandidate).Features.size()==RuleLength){ -// switch (MaximizeMeasure) { -// case SENSITIVITY: -// CurrentValue = (*CurrentCandidate).Performance.Sensitivity.Value; -// break; -// case SPECIFICITY: -// CurrentValue = (*CurrentCandidate).Performance.Specificity.Value; -// break; -// case NPV: -// CurrentValue = (*CurrentCandidate).Performance.NPV.Value; -// break; -// case PPV: -// CurrentValue = (*CurrentCandidate).Performance.PPV.Value; -// break; -// case ACCURACY: -// CurrentValue = (*CurrentCandidate).Performance.Accuracy.Value; -// break; -// } -// -// if (BestValue<=CurrentValue) { -// BestCandidate = (*CurrentCandidate); -// BestValue = CurrentValue; -// Found = true; -// } -// } -// CurrentCandidate++; -// } -// } -// -//#ifdef DEBUG_TIMING -// End = clock(); -// ExploreTiming.AddTime("EXPLORE::ChooseBestCandidate", Start, End); -//#endif -// -// if (Found) { -// return BestCandidate; -// } else { -// return CANDIDATE(); -// } - - return PartitionCandidates; -} - /********************************************************************** Function: GetFeatureOperatorSize() Category: . diff --git a/src/C++/Explore/rule.h b/src/C++/Explore/rule.h index 84d7ff2..b2e2d63 100755 --- a/src/C++/Explore/rule.h +++ b/src/C++/Explore/rule.h @@ -86,7 +86,6 @@ class RULE { bool IsPrintCombinations{}; bool IsPrintFeatureSets{}; bool IsPrintCutoffSets{}; - bool IsPrintCutoffSetsBestLength{}; vector ROCCurves; @@ -148,7 +147,7 @@ unsigned int NoFeatureOperators{}; // vector GetOperators(); // Get a list of operators vector GetCutoffs(); // Get a list of cutoffs - unsigned int GetMinCutoff(unsigned int FOperator); + unsigned int GetMinCutoff(unsigned int FOperator, int ConjunctionNr); bool CutoffsAtMax(int ConjunctionNr, int ConditionNr); unsigned int GetMaxCutoff(unsigned int FOperator); @@ -176,7 +175,6 @@ unsigned int NoFeatureOperators{}; // void SetPrintCombinations(bool Setting); void SetPrintFeatureSets(bool Setting); void SetPrintCutoffSets(bool Setting); - void SetPrintCutoffSetsBestLength(bool Setting); void PrintSettings(); void PrintCombination(); // Print partition information of the rule diff --git a/src/C++/Explore/set.cpp b/src/C++/Explore/set.cpp index de7a7dc..ff88daa 100755 --- a/src/C++/Explore/set.cpp +++ b/src/C++/Explore/set.cpp @@ -425,14 +425,14 @@ string SET::PrintPerformance() { struct AndJibu : public std ::unary_function { - const boost::dynamic_bitset<> Source; - boost::dynamic_bitset<> Dest; - AndJibu(const boost::dynamic_bitset<> Source,boost::dynamic_bitset<> Dest) : - Source(Source), Dest(Dest){} - void operator()(int x) - { - Dest[x]=Dest[x] & Source[x]; - } + const boost::dynamic_bitset<> Source; + boost::dynamic_bitset<> Dest; + AndJibu(const boost::dynamic_bitset<> Source,boost::dynamic_bitset<> Dest) : + Source(Source), Dest(Dest){} + void operator()(int x) + { + Dest[x]=Dest[x] & Source[x]; + } }; /********************************************************************** diff --git a/src/C++/IOExplore/IOExplore.cpp b/src/C++/IOExplore/IOExplore.cpp index f69a756..d5d82af 100644 --- a/src/C++/IOExplore/IOExplore.cpp +++ b/src/C++/IOExplore/IOExplore.cpp @@ -62,8 +62,8 @@ IOExplore::IOExplore() { Dummy.push_back("Accuracy"); Dummy.push_back("PPV"); Dummy.push_back("NPV"); - Dummy.push_back("BalancedAccuracy"); - Dummy.push_back("F1score"); + Dummy.push_back("BalancedAccuracy"); + Dummy.push_back("F1score"); Sections.push_back(Dummy); Dummy.clear(); @@ -146,7 +146,6 @@ void IOExplore::ClearSettings() { ProjectSettings.PrintCombinations = false; ProjectSettings.PrintFeatureSets = false; ProjectSettings.PrintCutoffSets = false; - ProjectSettings.PrintCutoffSetsBestLength = false; ProjectSettings.PrintPerformance = false; ProjectSettings.PrintSets = false; ProjectSettings.BranchBound = false; @@ -586,13 +585,12 @@ bool IOExplore::SaveExploreToProject(string IOFilename) { case NPV: ProjectSettings.Maximize = NPV; break; - - case BALANCEDACCURACY: - ProjectSettings.Maximize = BALANCEDACCURACY; - break; - case F1SCORE: - ProjectSettings.Maximize = F1SCORE; - break; + case BALANCEDACCURACY: + ProjectSettings.Maximize = BALANCEDACCURACY; + break; + case F1SCORE: + ProjectSettings.Maximize = F1SCORE; + break; } vector Constraints = Project->GetConstraints(); @@ -625,16 +623,16 @@ bool IOExplore::SaveExploreToProject(string IOFilename) { ProjectSettings.Accuracy = (*CurrentConstraint).Value; } break; - case BALANCEDACCURACY: - if ((*CurrentConstraint).Value != 0){ - ProjectSettings.BalancedAccuracy = (*CurrentConstraint).Value; - } - break; - case F1SCORE: - if ((*CurrentConstraint).Value != 0){ - ProjectSettings.F1score = (*CurrentConstraint).Value; - } - break; + case BALANCEDACCURACY: + if ((*CurrentConstraint).Value != 0){ + ProjectSettings.BalancedAccuracy = (*CurrentConstraint).Value; + } + break; + case F1SCORE: + if ((*CurrentConstraint).Value != 0){ + ProjectSettings.F1score = (*CurrentConstraint).Value; + } + break; } } ProjectFile.flush(); @@ -692,10 +690,6 @@ bool IOExplore::SaveExploreToProject(string IOFilename) { if (Project->GetPrintCutoffSets()) { ProjectSettings.PrintCutoffSets = true; } - ProjectSettings.PrintCutoffSetsBestLength = false; - if (Project->GetPrintCutoffSetsBestLength()) { - ProjectSettings.PrintCutoffSetsBestLength = true; - } ProjectSettings.PrintPerformance = false; if (Project->GetPrintPerformance()) { ProjectSettings.PrintPerformance = true; @@ -870,12 +864,12 @@ bool IOExplore::SaveSettingsToFile(string IOFilename) { case NPV: ProjectFile << "Maximize=NPV" << endl; break; - case BALANCEDACCURACY: - ProjectFile << "Maximize=BALANCEDACCURACY" << endl; - break; - case F1SCORE: - ProjectFile << "Maximize=F1SCORE" << endl; - break; + case BALANCEDACCURACY: + ProjectFile << "Maximize=BALANCEDACCURACY" << endl; + break; + case F1SCORE: + ProjectFile << "Maximize=F1SCORE" << endl; + break; } if (ProjectSettings.Sensitivity>0) { ProjectFile << "Sensitivity=" << ProjectSettings.Sensitivity << endl; @@ -892,12 +886,12 @@ bool IOExplore::SaveSettingsToFile(string IOFilename) { if (ProjectSettings.Accuracy>0) { ProjectFile << "Accuracy=" << ProjectSettings.Accuracy << endl; } - if (ProjectSettings.BalancedAccuracy>0) { - ProjectFile << "BalancedAccuracy=" << ProjectSettings.BalancedAccuracy << endl; - } - if (ProjectSettings.F1score>0) { - ProjectFile << "F1score=" << ProjectSettings.F1score << endl; - } + if (ProjectSettings.BalancedAccuracy>0) { + ProjectFile << "BalancedAccuracy=" << ProjectSettings.BalancedAccuracy << endl; + } + if (ProjectSettings.F1score>0) { + ProjectFile << "F1score=" << ProjectSettings.F1score << endl; + } ProjectFile << "[Output]" << endl; switch (ProjectSettings.OutputMethod) { case EVERY: @@ -1367,23 +1361,22 @@ bool IOExplore::SetupExploreFromProject(string IOFilename) { return false; } } - if (CurrentHeading.compare("BalancedAccuracy")==0) { // Balanced accuracy constraint - if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { - ProjectSettings.BalancedAccuracy = atof(CurrentValue.c_str()); - } else { - ProjectLoadErrors.push_back("Invalid value for constraint NPV."); - return false; - } + if (CurrentHeading.compare("BalancedAccuracy")==0) { // Balanced accuracy constraint + if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { + ProjectSettings.BalancedAccuracy = atof(CurrentValue.c_str()); + } else { + ProjectLoadErrors.push_back("Invalid value for constraint Balanced Accuracy."); + return false; } - - if (CurrentHeading.compare("F1score")==0) { // F1 score constraint - if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { - ProjectSettings.F1score = atof(CurrentValue.c_str()); - } else { - ProjectLoadErrors.push_back("Invalid value for constraint NPV."); - return false; - } + } + if (CurrentHeading.compare("F1score")==0) { // F1 score constraint + if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { + ProjectSettings.F1score = atof(CurrentValue.c_str()); + } else { + ProjectLoadErrors.push_back("Invalid value for constraint F1score."); + return false; } + } // Output Settings if (CurrentHeading.compare("OutputMethod")==0) { // Output method (ALL, INCREMENTAL or BEST) if (CurrentValue.compare("EVERY")==0) { @@ -1498,16 +1491,6 @@ bool IOExplore::SetupExploreFromProject(string IOFilename) { } } - if (CurrentHeading.compare("PrintCutoffSetsBestLength")==0) { // Print cutoffsets - if (CurrentValue.compare("yes")==0) { - ProjectSettings.PrintCutoffSetsBestLength = true; - } else if (CurrentValue.compare("no")==0) { - ProjectSettings.PrintCutoffSetsBestLength = false; - } else { - ProjectLoadErrors.push_back("Invalid value for print cutoffsets bestlength."); - return false; - } - } if (CurrentHeading.compare("PrintPerformance")==0) { // Print performance if (CurrentValue.compare("yes")==0) { ProjectSettings.PrintPerformance = true; @@ -1718,7 +1701,6 @@ bool IOExplore::SetupExploreFromStruct() { Project->SetPrintCombinations(ProjectSettings.PrintCombinations); Project->SetPrintFeatureSets(ProjectSettings.PrintFeatureSets); Project->SetPrintCutoffSets(ProjectSettings.PrintCutoffSets); - Project->SetPrintCutoffSetsBestLength(ProjectSettings.PrintCutoffSetsBestLength); Project->SetPrintPerformance(ProjectSettings.PrintPerformance); Project->SetPrintSets(ProjectSettings.PrintSets); Project->SetSavePartitions(ProjectSettings.SavePartitions); diff --git a/src/C++/IOExplore/IOExplore.h b/src/C++/IOExplore/IOExplore.h index bf1b25e..f1af394 100755 --- a/src/C++/IOExplore/IOExplore.h +++ b/src/C++/IOExplore/IOExplore.h @@ -64,7 +64,6 @@ struct ExploreSettings { bool PrintCombinations; bool PrintFeatureSets; bool PrintCutoffSets; - bool PrintCutoffSetsBestLength; bool PrintPerformance; bool PrintSets; bool SavePartitions; diff --git a/tests/testthat/test-HelperFunctions.R b/tests/testthat/test-HelperFunctions.R new file mode 100644 index 0000000..3d27d8d --- /dev/null +++ b/tests/testthat/test-HelperFunctions.R @@ -0,0 +1,13 @@ +test_that("Convert logical to 0/1", { + train_data <- data.frame(check.names = FALSE, + outcomeCount = c(FALSE,FALSE,FALSE, + FALSE,FALSE,TRUE), + `198124209` = c(FALSE,FALSE,FALSE, + FALSE,FALSE,TRUE), + `316139209` = c(FALSE,FALSE,FALSE, + FALSE,FALSE,FALSE), + `316139210` = c(FALSE,FALSE,FALSE, + FALSE,FALSE,FALSE)) + train_data <- convert_logical(train_data) + expect_true(all(sapply(train_data, function(col) all(col %in% c(0, 1))))) +}) diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 8dc85a2..98f3ddf 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -110,3 +110,93 @@ test_that("compute AUC", { expect_true(auroc < 100) expect_true(auroc > 0) }) + +test_that("mandatory features", { + ### Tests for EXPLORE using iris dataset + data_path <- system.file("examples", "tests", "iris.arff", package = "Explore") + settings_path <- system.file("examples", "tests", "iris.project", package = "Explore") + output_path <- paste0(tempdir(), "/", "Test1") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + output_path <- paste0(output_path, "/") + data <- farff::readARFF(data_path) + model <- Explore::trainExplore(output_path = output_path, + file_name = "iris", + train_data = data, + ClassFeature = "'class'", + PositiveClass = '"Iris-versicolor"', + FeatureInclude = "'sepalwidth';'sepallength'") + expect_equal(class(model), "character") + # expect_true(is.na(model), info = "Test failed because model is NA") + expect_equal(model, "'sepallength'>4.9AND'sepalwidth'<=3.2AND'petalwidth'<=1.7") +}) + +test_that("balanced accuracy constraint ", { + data_path <- system.file("examples", "complexity", "mix_4.arff", package = "Explore") + output_path <- paste0(getwd(), "/", "Test1") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + output_path <- paste0(output_path, "/") + + data <- farff::readARFF(data_path) + data <-as.data.frame(apply(data,2,as.numeric)) + + model_without <- Explore::trainExplore(output_path = output_path, + file_name = "mix_4", + train_data = data, + StartRulelength = 3, + ClassFeature = "'outcomeCount'", + PositiveClass = '"1"') + num_without <- Explore::candidatesExplore(paste0(output_path, "mix_4", ".result")) + + model_with <- Explore::trainExplore(output_path = output_path, + file_name = "mix_4", + train_data = data, + StartRulelength = 3, + ClassFeature = "'outcomeCount'", + PositiveClass = '"1"', + BalancedAccuracy = 0.6, + OutputMethod = "EVERY", + Parallel = FALSE) + num_with <- Explore::candidatesExplore(paste0(output_path, "mix_4", ".result")) + + expect_equal(num_without, 1940) + expect_equal(num_with, 36) +}) + +test_that("Results Explore", { + + dataset <- "binary_3" + config <- getDataSetPath(dataset = dataset) + train_data <- farff::readARFF(config$data_path) + output_path <- paste0(tempdir(), "/", glue::glue("{getRandomId()}"), "/") + file_name <- paste0(dataset, "_train_data") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + + result <- trainExplore(train_data = train_data, + settings_path = NULL, + output_path = output_path, + file_name = file_name, + StartRulelength = 1, + EndRulelength = 2, + CutoffMethod = "RVAC", + ClassFeature = "'outcomeCount'", + PositiveClass = "\"1\"", + Maximize = "ACCURACY", + PrintPerformance = TRUE, + Subsumption = TRUE) + + + outputFile <- paste0(output_path, file_name, ".result") + results_list <- resultsExplore(outputFile = outputFile) + expect_equal(results_list$total_count_cutoff_sets, "16") + expect_length(results_list$candidate_model, 32) + +}) diff --git a/tests/testthat/test-runExplore.R b/tests/testthat/test-runExplore.R index ce27014..a5f55f4 100644 --- a/tests/testthat/test-runExplore.R +++ b/tests/testthat/test-runExplore.R @@ -108,12 +108,11 @@ test_that("getDataSetPath list cases", { dataset <- "iris" data_path <- getDataSetPath(dataset = dataset) - expect_equal(data_path$ClassFeature, "'class'") - + expect_equal(data_path$class_feature, "'class'") dataset <- "binary_3" data_path <- getDataSetPath(dataset = dataset) - expect_equal(data_path$ClassFeature, "'outcomeCount'") + expect_equal(data_path$class_feature, "'outcomeCount'") dataset <- "mix_4_ordered" expect_error(getDataSetPath(dataset = dataset)) diff --git a/tests/testthat/test-testExplore.R b/tests/testthat/test-testExplore.R index cb0dd91..69dc096 100644 --- a/tests/testthat/test-testExplore.R +++ b/tests/testthat/test-testExplore.R @@ -1,3 +1,48 @@ +test_that("binary_3 trainExplore resultsExplore", { + + dataset <- "binary_3" + config <- getDataSetPath(dataset = dataset) + train_data <- farff::readARFF(config$data_path) + output_path <- paste0(tempdir(), "/", glue::glue("{getRandomId()}"), "/") + file_name <- paste0(dataset, "_train_data") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + + result <- trainExplore(train_data = train_data, + settings_path = NULL, + output_path = output_path, + file_name = file_name, + OutputFile = NULL, + StartRulelength = 1, + EndRulelength = 1, + OperatorMethod = "MEDIAN", + CutoffMethod = "RVAC", + ClassFeature = config$class_feature, + PositiveClass = config$positive_class, + FeatureInclude = "", + Maximize = "ACCURACY", + Accuracy = 0, + BalancedAccuracy = 0, + Specificity = 0, + PrintSettings = TRUE, + PrintPerformance = TRUE, + Subsumption = TRUE, + BranchBound = TRUE, + Parallel = FALSE, + PrintCutoffSets = TRUE, + Sorted = "none", + OutputMethod = "EVERY", + BinaryReduction = FALSE) + + outputFile <- paste0(output_path, file_name, ".result") + results_list <- resultsExplore(outputFile = outputFile) + expect_length(results_list$candidate_model, 6) + unlink(output_path, recursive = TRUE) + +}) + test_that("Test binary_3", { # Binary reduction FALSE