Merge pull request #27 from mi-erasmusmc/unary_function_change

Unary function change
mi-erasmusmc · Oct 1, 2024 · a3a1e0b · a3a1e0b
2 parents 1e176cd + 2cd294b
commit a3a1e0b
Show file tree

Hide file tree

Showing 34 changed files with 867 additions and 803 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(candidateNumberExplore)
 export(modelsCurveExplore)
 export(predictExplore)
 export(rocCurveExplore)

diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R
@@ -78,8 +78,13 @@ saveData <- function(output_path, train_data, file_name) {
 
   # Fix col type for binary data
   binary_cols <- sapply(1:ncol(train_data), function(c) all(train_data[[c]] %in% 0:1))
-  train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], labels=c(0,1)))
+
+  # Convert TRUE/FALSE to 1/0
+  train_data <- convert_logical(train_data)
 
+  # Order data (first binary then continuous features)
+  train_data <- cbind(train_data[binary_cols],train_data[!binary_cols]) # Order needed for correct functioning of main algorithm in C++ 
+
   # Save data as arff file
   if (file.exists(paste0(output_path, file_name, ".arff"))) {file.remove(paste0(output_path, file_name, ".arff"))}
   farff::writeARFF(train_data, paste0(output_path, file_name, ".arff"))
@@ -97,9 +102,43 @@ saveData <- function(output_path, train_data, file_name) {
   # TODO: Support other file formats?
 }
 
+convert_logical <- function(train_data) {
+
+  binary_cols <- sapply(train_data, function(col) all(col %in% c(0, 1, TRUE, FALSE)))
+
+  # Convert TRUE/FALSE to 1/0 and create factors
+  train_data[binary_cols] <- lapply(train_data[binary_cols], function(col) {
+    col <- as.numeric(as.logical(col))  # Convert TRUE/FALSE to 1/0
+    factor(col, levels = c(0, 1), labels = c(0, 1))  # Convert to factors
+  })
+
+  return(train_data)
+
+}
+
 # Correlation metric for binary data.
 jaccard <- function(a, b) {
   intersection = length(intersect(a, b))
   union = length(a) + length(b) - intersection
   return (intersection/union)
-}
+}
+
+phi <- function(a, b) {
+  contingency_tb <- table(a, b)
+
+  r.sum <- rowSums(contingency_tb)
+  c.sum <- colSums(contingency_tb)
+
+  total <- sum(r.sum)
+  r.sum <- r.sum/total
+  c.sum <- c.sum/total
+
+  v <- prod(r.sum, c.sum)
+  phi <- (contingency_tb[1,1] / total - c.sum[1] * r.sum[1] / sqrt(v))
+  names(phi) <- NULL
+
+  return(phi)
+}
+
+
+
diff --git a/R/MainFunctions.R b/R/MainFunctions.R
@@ -39,17 +39,17 @@ trainExplore <- function(train_data = NULL,
                          StartRulelength = 1,
                          EndRulelength = 3,
                          OperatorMethod = "EXHAUSTIVE",
-                         CutoffMethod = "RVAC",
+                         CutoffMethod = "ALL",
                          ClassFeature = "'class'",
                          PositiveClass = "'Iris-versicolor'",
                          FeatureInclude = "",
-                         Maximize = "ACCURACY",
+                         Maximize = "BALANCEDACCURACY",
                          Accuracy = 0,
                          BalancedAccuracy = 0,
                          Specificity = 0,
                          PrintSettings = TRUE,
-                         PrintPerformance = TRUE,
-                         Subsumption = TRUE,
+                         PrintPerformance = FALSE,
+                         Subsumption = FALSE,
                          BranchBound = TRUE,
                          Parallel = FALSE,
                          PrintCutoffSets = TRUE,
@@ -101,6 +101,7 @@ trainExplore <- function(train_data = NULL,
                     checkDouble(Accuracy),
                     checkDouble(BalancedAccuracy),
                     checkDouble(Specificity),
+                    checkString(OutputMethod),
                     checkLogical(PrintSettings),
                     checkLogical(PrintPerformance),
                     checkLogical(Subsumption),
@@ -121,6 +122,7 @@ trainExplore <- function(train_data = NULL,
   Subsumption <- ifelse(Subsumption == TRUE, "yes", "no")
   BranchBound <- ifelse(BranchBound == TRUE, "yes", "no")
   Parallel <- ifelse(Parallel == TRUE, "yes", "no")
+  BinaryReduction <- ifelse(BinaryReduction == TRUE, "yes", "no")
   Accuracy <- ifelse(Accuracy == 0, "", Accuracy)
   BalancedAccuracy <- ifelse(BalancedAccuracy == 0, "", BalancedAccuracy)
   Specificity <- ifelse(Specificity == 0, "", Specificity)
@@ -155,7 +157,9 @@ trainExplore <- function(train_data = NULL,
         cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) cor(col, train_data[ClassFeature_]==PositiveClass_, method=Sorted))
       } else if (Sorted == "jaccard") {
         cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) jaccard(col, train_data[ClassFeature_]==PositiveClass_))
-      }
+      } else if (Sorted == "phi") { 
+        cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) phi(col, train_data[ClassFeature_]==PositiveClass_))
+      } 
       # else if (Sorted == "LASSO") { 
       #   model_lasso <- glmnet::cv.glmnet(x=data.matrix(train_data[, -which(names(train_data) == ClassFeature_)]), y = train_data[ClassFeature_]==PositiveClass_, alpha = 1, lambda = 10^seq(3, -2, by = -.1), maxit=10000000, standardize = TRUE, nfolds = 5, family = "binomial")
       #   coef <- as.matrix(coef(model_lasso, s = "lambda.min")) # get importance
@@ -164,7 +168,7 @@ trainExplore <- function(train_data = NULL,
       # }
 
       coef <- names(cor)[order(-abs(cor))] 
-      train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by LASSO importance
+      train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by importance
     }
 
     saveData(output_path, train_data, file_name)
@@ -212,17 +216,18 @@ trainExplore <- function(train_data = NULL,
   #                "cutoff_sets" = cutoff_sets)
 
   # Load model
-  rule_string <- stringr::str_extract(results, "Best candidate \\(overall\\):.*?\u000A")
+  rule_string <- stringr::str_extract_all(results, "Best candidate:.*?\u000A")
+  rule_string <- unlist(rule_string)[[length(rule_string)]] # Select the last rule as this is the final candidate
 
   # Clean string
-  rule_string <- stringr::str_replace(rule_string, "Best candidate \\(overall\\):", "")
+  rule_string <- stringr::str_replace(rule_string, "Best candidate:", "")
   rule_string <- stringr::str_replace_all(rule_string, " ", "")
   rule_string <- stringr::str_replace_all(rule_string, "\\n", "")
 
 
   results <- list("model" = rule_string,
-                 "candidate_models" = candidate_models,
-                 "cutoff_sets" = cutoff_sets)
+                  "candidate_models" = candidate_models,
+                  "cutoff_sets" = cutoff_sets)
 
   result <- results[resultType]
 
@@ -249,6 +254,7 @@ trainExplore <- function(train_data = NULL,
 #' @param Maximize One of list with strings, list = "ACCURACY", ...
 #' @param Accuracy Float 0-1 -> default = 0 (if 0, make empty = computationally more beneficial)
 #' @param Specificity  float 0-1, default = 0
+#' @param OutputMethod string EVERY, BEST, INCREMENT
 #' @param PrintSettings True or False
 #' @param PrintPerformance True or False
 #' @param Subsumption True or False
@@ -269,17 +275,18 @@ settingsExplore <- function(settings,
                             ClassFeature,
                             PositiveClass,
                             FeatureInclude = "",
-                            Maximize = "ACCURACY",
+                            Maximize = "BALANCEDACCURACY",
                             Accuracy = 0,
                             BalancedAccuracy = 0,
                             Specificity = 0,
+                            OutputMethod = "BEST",
                             PrintSettings = "yes",
-                            PrintPerformance = "yes",
-                            PrintCutoffSets = "yes",
-                            Subsumption = "yes",
+                            PrintPerformance = "no",
+                            PrintCutoffSets = "no",
+                            Subsumption = "no",
                             BranchBound = "yes",
                             Parallel = "no",
-                            OutputMethod = "EVERY",
+                            ParallelMethod = "TWO",
                             BinaryReduction = "no") {
 
 
@@ -308,6 +315,7 @@ settingsExplore <- function(settings,
   settings <- changeSetting(settings, parameter = "Subsumption", input = Subsumption)
   settings <- changeSetting(settings, parameter = "BranchBound", input = BranchBound)
   settings <- changeSetting(settings, parameter = "Parallel", input = Parallel)
+  settings <- changeSetting(settings, parameter = "ParallelMethod", input = ParallelMethod)
   settings <- changeSetting(settings, parameter = "OutputMethod", input = OutputMethod)
   settings <- changeSetting(settings, parameter = "BinaryReduction", input = BinaryReduction)
 
@@ -339,6 +347,11 @@ predictExplore <- function(model, test_data) {
     return(NULL)
   }
 
+  # Clean string
+  model <- stringr::str_remove_all(model, '\"')
+  model <- stringr::str_replace_all(model, "=", "==") 
+  model <- stringr::str_replace_all(model, "<=", "<") # to correct initial case <= -> <== -> <= 
+
   # Split string 
   all_terms <- stringr::str_split_fixed(model, "OR", n=Inf)
 
@@ -360,12 +373,59 @@ predictExplore <- function(model, test_data) {
     data_model <- cbind(data_model, as.integer(col==length(all_literals)))
   }
 
-  colnames(data_model) <- all_terms
+  colnames(data_model) <- all_terms # TODO: CHECK HERE WHY DATA_MODEL NO COLUMNS
   predictions <- as.integer(rowSums(data_model)>0)
 
   return(predictions)
 }
 
+#' Return a set of results from EXPLORE output file
+#' @param outputFile outputfile = paste0(output_path, file_name, ".result")
+#'
+#' @export
+resultsExplore <- function(outputFile) {
+
+  # Read in results file
+  results <- paste(readLines(outputFile), collapse="\n")
+  results_lines <- strsplit(results, "\n") %>% unlist()
+
+  result <- list()
+
+  for (line in results_lines) {
+    # line <- "Candidate model: '198124209' = \"0\"" 
+    if (grepl(":", line)) {
+      if (grepl("Candidate model", line)) {
+        split_line <- strsplit(line, ":")[[1]]
+        key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .) 
+        value <- trimws(split_line[2]) 
+        result[[key]] <- c(result[[key]], value)
+      } else {
+        split_line <- strsplit(line, ":")[[1]]
+        key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .) 
+        value <- trimws(split_line[2])
+        result[[key]] <- value
+      }
+    }
+  }
+
+  return(result)
+}
+
+#' Return the number of candidate rules for EXPLORE
+#' @param OutputFile output file = paste0(output_path, file_name, ".result")
+#'
+#' @export
+candidateNumberExplore <- function(OutputFile) {
+
+  # Read in results file
+  results <- paste(readLines(OutputFile), collapse="\n")
+
+  num_candidates <- stringr::str_extract_all(results, "Total Count Candidates \\(incl constraints\\):.*?\u000A")[[1]]
+  num_candidates <- as.data.frame(stringr::str_remove_all(num_candidates, "Total Count Candidates \\(incl constraints\\):"))
+  num_candidates <- stringr::str_replace_all(num_candidates, "\\n", "")
+
+  return(as.numeric(num_candidates))
+}
 
 #' modelsCurveExplore # TODO: update documentation?
 #'
@@ -386,19 +446,23 @@ modelsCurveExplore <- function(train_data = NULL,
                                StartRulelength = 1,
                                EndRulelength = 3,
                                OperatorMethod = "EXHAUSTIVE",
-                               CutoffMethod = "RVAC",
+                               CutoffMethod = "ALL",
                                ClassFeature = "'class'",
                                PositiveClass = "'Iris-versicolor'",
                                FeatureInclude = "",
-                               Maximize = "ACCURACY",
+                               Maximize = "BALANCEDACCURACY",
                                Accuracy = 0,
                                BalancedAccuracy = 0,
                                Specificity = 0,
+                               OutputMethod = "BEST", 
                                PrintSettings = TRUE,
-                               PrintPerformance = TRUE,
-                               Subsumption = TRUE,
+                               PrintPerformance = FALSE,
+                               Subsumption = FALSE,
                                BranchBound = TRUE,
-                               Parallel = FALSE) {
+                               Sorted = "none",
+                               Parallel = TRUE,
+                               ParallelMethod = "TWO",
+                               BinaryReduction = FALSE) {
   # TODO: only input required variables?
 
   # Range of specificities to check
@@ -418,9 +482,9 @@ modelsCurveExplore <- function(train_data = NULL,
                                      ClassFeature = ClassFeature, PositiveClass = PositiveClass,
                                      FeatureInclude = FeatureInclude, Maximize = "SENSITIVITY",
                                      Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, Specificity = constraint,
-                                     PrintSettings = PrintSettings, PrintPerformance = PrintPerformance,
+                                     OutputMethod = OutputMethod, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance,
                                      Subsumption = Subsumption, BranchBound = BranchBound,
-                                     Parallel = Parallel)
+                                     Parallel = Parallel, ParallelMethod = ParallelMethod)
 
       return(model)
     })

diff --git a/R/testExplore.R b/R/testExplore.R
@@ -1,4 +1,7 @@
-testExplore <- function(dataset = "iris", StartRulelength = 2, EndRulelength = 2, BinaryReduction = FALSE) {
+testExplore <- function(dataset = "iris", 
+                        StartRulelength = 2, 
+                        EndRulelength = 2, 
+                        BinaryReduction = FALSE) {
   # dataset = "iris"
   # dataset = "binary_3"
   # dataset = "binary_10"

diff --git a/inst/CMakeLists.txt b/inst/CMakeLists.txt
@@ -38,9 +38,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ltbb")
 # set(CMAKE_PREFIX_PATH "/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8")
 # find_library(TBB_LIB tbb)
 # find_path(TBB_PATH
- #       HINTS /opt/intel/oneapi/tbb/latest/include
- #       NAMES tbb/parallel_for.h)
- 
+#       HINTS /opt/intel/oneapi/tbb/latest/include
+#       NAMES tbb/parallel_for.h)
+
 set(SOURCE_FILES
         Clion/main.cpp
         ../src/C++/CMExplore/cmdline.h
@@ -93,8 +93,7 @@ set(SOURCE_FILES
         ../src/C++/IOExplore/IOExplore.h
         ../src/C++/common.cpp
         ../src/C++/common.h
-        ../src/C++/stl.h
-        ../src/C++/stlpmt.lib)
+        ../src/C++/stl.h)
 
 add_executable(Explore ${SOURCE_FILES})
 

diff --git a/inst/examples/complexity/binary_10.project b/inst/examples/complexity/binary_10.project
@@ -8,7 +8,7 @@ IncrementalOutputFile=false
 [Setup]
 PartitionMethod=RESUBSTITUTION
 Randomize=no
-StartRulelength=3
+StartRulelength=1
 EndRulelength=3
 LearnRatio=0.8
 NumberofPartitions=1
@@ -34,17 +34,16 @@ PrintCutoffMethod=no
 PrintCutoffValues=no
 PrintOperatorMethod=no
 PrintOperatorValues=no
-PrintCombinations=yes
-PrintFeatureSets=yes
+PrintCombinations=no
+PrintFeatureSets=no
 PrintCutoffSets=no
-PrintCutOffsetsBestLength=no
 PrintPerformance=yes
 PrintSets=no
 SavePartitions=no
 [Run]
 Subsumption=no
 BranchBound=no
 Parallel=no
-ParallelMethod=ONE
+ParallelMethod=TWO
 BinaryReduction=no
 
diff --git a/inst/examples/complexity/binary_3.project b/inst/examples/complexity/binary_3.project
@@ -34,17 +34,16 @@ PrintCutoffMethod=no
 PrintCutoffValues=no
 PrintOperatorMethod=no
 PrintOperatorValues=no
-PrintCombinations=yes
-PrintFeatureSets=yes
+PrintCombinations=no
+PrintFeatureSets=no
 PrintCutoffSets=no
-PrintCutOffsetsBestLength=no
 PrintPerformance=yes
 PrintSets=no
 SavePartitions=no
 [Run]
 Subsumption=no
 BranchBound=no
 Parallel=no
-ParallelMethod=ONE
+ParallelMethod=TWO
 BinaryReduction=no