Skip to content

Commit

Permalink
Merge pull request #27 from mi-erasmusmc/unary_function_change
Browse files Browse the repository at this point in the history
Unary function change
  • Loading branch information
cebarboza authored Oct 1, 2024
2 parents 1e176cd + 2cd294b commit a3a1e0b
Show file tree
Hide file tree
Showing 34 changed files with 867 additions and 803 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(candidateNumberExplore)
export(modelsCurveExplore)
export(predictExplore)
export(rocCurveExplore)
Expand Down
43 changes: 41 additions & 2 deletions R/HelperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,13 @@ saveData <- function(output_path, train_data, file_name) {

# Fix col type for binary data
binary_cols <- sapply(1:ncol(train_data), function(c) all(train_data[[c]] %in% 0:1))
train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], labels=c(0,1)))

# Convert TRUE/FALSE to 1/0
train_data <- convert_logical(train_data)

# Order data (first binary then continuous features)
train_data <- cbind(train_data[binary_cols],train_data[!binary_cols]) # Order needed for correct functioning of main algorithm in C++

# Save data as arff file
if (file.exists(paste0(output_path, file_name, ".arff"))) {file.remove(paste0(output_path, file_name, ".arff"))}
farff::writeARFF(train_data, paste0(output_path, file_name, ".arff"))
Expand All @@ -97,9 +102,43 @@ saveData <- function(output_path, train_data, file_name) {
# TODO: Support other file formats?
}

convert_logical <- function(train_data) {

binary_cols <- sapply(train_data, function(col) all(col %in% c(0, 1, TRUE, FALSE)))

# Convert TRUE/FALSE to 1/0 and create factors
train_data[binary_cols] <- lapply(train_data[binary_cols], function(col) {
col <- as.numeric(as.logical(col)) # Convert TRUE/FALSE to 1/0
factor(col, levels = c(0, 1), labels = c(0, 1)) # Convert to factors
})

return(train_data)

}

# Correlation metric for binary data.
jaccard <- function(a, b) {
intersection = length(intersect(a, b))
union = length(a) + length(b) - intersection
return (intersection/union)
}
}

phi <- function(a, b) {
contingency_tb <- table(a, b)

r.sum <- rowSums(contingency_tb)
c.sum <- colSums(contingency_tb)

total <- sum(r.sum)
r.sum <- r.sum/total
c.sum <- c.sum/total

v <- prod(r.sum, c.sum)
phi <- (contingency_tb[1,1] / total - c.sum[1] * r.sum[1] / sqrt(v))
names(phi) <- NULL

return(phi)
}



110 changes: 87 additions & 23 deletions R/MainFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,17 @@ trainExplore <- function(train_data = NULL,
StartRulelength = 1,
EndRulelength = 3,
OperatorMethod = "EXHAUSTIVE",
CutoffMethod = "RVAC",
CutoffMethod = "ALL",
ClassFeature = "'class'",
PositiveClass = "'Iris-versicolor'",
FeatureInclude = "",
Maximize = "ACCURACY",
Maximize = "BALANCEDACCURACY",
Accuracy = 0,
BalancedAccuracy = 0,
Specificity = 0,
PrintSettings = TRUE,
PrintPerformance = TRUE,
Subsumption = TRUE,
PrintPerformance = FALSE,
Subsumption = FALSE,
BranchBound = TRUE,
Parallel = FALSE,
PrintCutoffSets = TRUE,
Expand Down Expand Up @@ -101,6 +101,7 @@ trainExplore <- function(train_data = NULL,
checkDouble(Accuracy),
checkDouble(BalancedAccuracy),
checkDouble(Specificity),
checkString(OutputMethod),
checkLogical(PrintSettings),
checkLogical(PrintPerformance),
checkLogical(Subsumption),
Expand All @@ -121,6 +122,7 @@ trainExplore <- function(train_data = NULL,
Subsumption <- ifelse(Subsumption == TRUE, "yes", "no")
BranchBound <- ifelse(BranchBound == TRUE, "yes", "no")
Parallel <- ifelse(Parallel == TRUE, "yes", "no")
BinaryReduction <- ifelse(BinaryReduction == TRUE, "yes", "no")
Accuracy <- ifelse(Accuracy == 0, "", Accuracy)
BalancedAccuracy <- ifelse(BalancedAccuracy == 0, "", BalancedAccuracy)
Specificity <- ifelse(Specificity == 0, "", Specificity)
Expand Down Expand Up @@ -155,7 +157,9 @@ trainExplore <- function(train_data = NULL,
cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) cor(col, train_data[ClassFeature_]==PositiveClass_, method=Sorted))
} else if (Sorted == "jaccard") {
cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) jaccard(col, train_data[ClassFeature_]==PositiveClass_))
}
} else if (Sorted == "phi") {
cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) phi(col, train_data[ClassFeature_]==PositiveClass_))
}
# else if (Sorted == "LASSO") {
# model_lasso <- glmnet::cv.glmnet(x=data.matrix(train_data[, -which(names(train_data) == ClassFeature_)]), y = train_data[ClassFeature_]==PositiveClass_, alpha = 1, lambda = 10^seq(3, -2, by = -.1), maxit=10000000, standardize = TRUE, nfolds = 5, family = "binomial")
# coef <- as.matrix(coef(model_lasso, s = "lambda.min")) # get importance
Expand All @@ -164,7 +168,7 @@ trainExplore <- function(train_data = NULL,
# }

coef <- names(cor)[order(-abs(cor))]
train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by LASSO importance
train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by importance
}

saveData(output_path, train_data, file_name)
Expand Down Expand Up @@ -212,17 +216,18 @@ trainExplore <- function(train_data = NULL,
# "cutoff_sets" = cutoff_sets)

# Load model
rule_string <- stringr::str_extract(results, "Best candidate \\(overall\\):.*?\u000A")
rule_string <- stringr::str_extract_all(results, "Best candidate:.*?\u000A")
rule_string <- unlist(rule_string)[[length(rule_string)]] # Select the last rule as this is the final candidate

# Clean string
rule_string <- stringr::str_replace(rule_string, "Best candidate \\(overall\\):", "")
rule_string <- stringr::str_replace(rule_string, "Best candidate:", "")
rule_string <- stringr::str_replace_all(rule_string, " ", "")
rule_string <- stringr::str_replace_all(rule_string, "\\n", "")


results <- list("model" = rule_string,
"candidate_models" = candidate_models,
"cutoff_sets" = cutoff_sets)
"candidate_models" = candidate_models,
"cutoff_sets" = cutoff_sets)

result <- results[resultType]

Expand All @@ -249,6 +254,7 @@ trainExplore <- function(train_data = NULL,
#' @param Maximize One of list with strings, list = "ACCURACY", ...
#' @param Accuracy Float 0-1 -> default = 0 (if 0, make empty = computationally more beneficial)
#' @param Specificity float 0-1, default = 0
#' @param OutputMethod string EVERY, BEST, INCREMENT
#' @param PrintSettings True or False
#' @param PrintPerformance True or False
#' @param Subsumption True or False
Expand All @@ -269,17 +275,18 @@ settingsExplore <- function(settings,
ClassFeature,
PositiveClass,
FeatureInclude = "",
Maximize = "ACCURACY",
Maximize = "BALANCEDACCURACY",
Accuracy = 0,
BalancedAccuracy = 0,
Specificity = 0,
OutputMethod = "BEST",
PrintSettings = "yes",
PrintPerformance = "yes",
PrintCutoffSets = "yes",
Subsumption = "yes",
PrintPerformance = "no",
PrintCutoffSets = "no",
Subsumption = "no",
BranchBound = "yes",
Parallel = "no",
OutputMethod = "EVERY",
ParallelMethod = "TWO",
BinaryReduction = "no") {


Expand Down Expand Up @@ -308,6 +315,7 @@ settingsExplore <- function(settings,
settings <- changeSetting(settings, parameter = "Subsumption", input = Subsumption)
settings <- changeSetting(settings, parameter = "BranchBound", input = BranchBound)
settings <- changeSetting(settings, parameter = "Parallel", input = Parallel)
settings <- changeSetting(settings, parameter = "ParallelMethod", input = ParallelMethod)
settings <- changeSetting(settings, parameter = "OutputMethod", input = OutputMethod)
settings <- changeSetting(settings, parameter = "BinaryReduction", input = BinaryReduction)

Expand Down Expand Up @@ -339,6 +347,11 @@ predictExplore <- function(model, test_data) {
return(NULL)
}

# Clean string
model <- stringr::str_remove_all(model, '\"')
model <- stringr::str_replace_all(model, "=", "==")
model <- stringr::str_replace_all(model, "<=", "<") # to correct initial case <= -> <== -> <=

# Split string
all_terms <- stringr::str_split_fixed(model, "OR", n=Inf)

Expand All @@ -360,12 +373,59 @@ predictExplore <- function(model, test_data) {
data_model <- cbind(data_model, as.integer(col==length(all_literals)))
}

colnames(data_model) <- all_terms
colnames(data_model) <- all_terms # TODO: CHECK HERE WHY DATA_MODEL NO COLUMNS
predictions <- as.integer(rowSums(data_model)>0)

return(predictions)
}

#' Return a set of results from EXPLORE output file
#' @param outputFile outputfile = paste0(output_path, file_name, ".result")
#'
#' @export
resultsExplore <- function(outputFile) {

# Read in results file
results <- paste(readLines(outputFile), collapse="\n")
results_lines <- strsplit(results, "\n") %>% unlist()

result <- list()

for (line in results_lines) {
# line <- "Candidate model: '198124209' = \"0\""
if (grepl(":", line)) {
if (grepl("Candidate model", line)) {
split_line <- strsplit(line, ":")[[1]]
key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .)
value <- trimws(split_line[2])
result[[key]] <- c(result[[key]], value)
} else {
split_line <- strsplit(line, ":")[[1]]
key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .)
value <- trimws(split_line[2])
result[[key]] <- value
}
}
}

return(result)
}

#' Return the number of candidate rules for EXPLORE
#' @param OutputFile output file = paste0(output_path, file_name, ".result")
#'
#' @export
candidateNumberExplore <- function(OutputFile) {

# Read in results file
results <- paste(readLines(OutputFile), collapse="\n")

num_candidates <- stringr::str_extract_all(results, "Total Count Candidates \\(incl constraints\\):.*?\u000A")[[1]]
num_candidates <- as.data.frame(stringr::str_remove_all(num_candidates, "Total Count Candidates \\(incl constraints\\):"))
num_candidates <- stringr::str_replace_all(num_candidates, "\\n", "")

return(as.numeric(num_candidates))
}

#' modelsCurveExplore # TODO: update documentation?
#'
Expand All @@ -386,19 +446,23 @@ modelsCurveExplore <- function(train_data = NULL,
StartRulelength = 1,
EndRulelength = 3,
OperatorMethod = "EXHAUSTIVE",
CutoffMethod = "RVAC",
CutoffMethod = "ALL",
ClassFeature = "'class'",
PositiveClass = "'Iris-versicolor'",
FeatureInclude = "",
Maximize = "ACCURACY",
Maximize = "BALANCEDACCURACY",
Accuracy = 0,
BalancedAccuracy = 0,
Specificity = 0,
OutputMethod = "BEST",
PrintSettings = TRUE,
PrintPerformance = TRUE,
Subsumption = TRUE,
PrintPerformance = FALSE,
Subsumption = FALSE,
BranchBound = TRUE,
Parallel = FALSE) {
Sorted = "none",
Parallel = TRUE,
ParallelMethod = "TWO",
BinaryReduction = FALSE) {
# TODO: only input required variables?

# Range of specificities to check
Expand All @@ -418,9 +482,9 @@ modelsCurveExplore <- function(train_data = NULL,
ClassFeature = ClassFeature, PositiveClass = PositiveClass,
FeatureInclude = FeatureInclude, Maximize = "SENSITIVITY",
Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, Specificity = constraint,
PrintSettings = PrintSettings, PrintPerformance = PrintPerformance,
OutputMethod = OutputMethod, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance,
Subsumption = Subsumption, BranchBound = BranchBound,
Parallel = Parallel)
Parallel = Parallel, ParallelMethod = ParallelMethod)

return(model)
})
Expand Down
5 changes: 4 additions & 1 deletion R/testExplore.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
testExplore <- function(dataset = "iris", StartRulelength = 2, EndRulelength = 2, BinaryReduction = FALSE) {
testExplore <- function(dataset = "iris",
StartRulelength = 2,
EndRulelength = 2,
BinaryReduction = FALSE) {
# dataset = "iris"
# dataset = "binary_3"
# dataset = "binary_10"
Expand Down
9 changes: 4 additions & 5 deletions inst/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ltbb")
# set(CMAKE_PREFIX_PATH "/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8")
# find_library(TBB_LIB tbb)
# find_path(TBB_PATH
# HINTS /opt/intel/oneapi/tbb/latest/include
# NAMES tbb/parallel_for.h)
# HINTS /opt/intel/oneapi/tbb/latest/include
# NAMES tbb/parallel_for.h)

set(SOURCE_FILES
Clion/main.cpp
../src/C++/CMExplore/cmdline.h
Expand Down Expand Up @@ -93,8 +93,7 @@ set(SOURCE_FILES
../src/C++/IOExplore/IOExplore.h
../src/C++/common.cpp
../src/C++/common.h
../src/C++/stl.h
../src/C++/stlpmt.lib)
../src/C++/stl.h)

add_executable(Explore ${SOURCE_FILES})

Expand Down
9 changes: 4 additions & 5 deletions inst/examples/complexity/binary_10.project
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ IncrementalOutputFile=false
[Setup]
PartitionMethod=RESUBSTITUTION
Randomize=no
StartRulelength=3
StartRulelength=1
EndRulelength=3
LearnRatio=0.8
NumberofPartitions=1
Expand All @@ -34,17 +34,16 @@ PrintCutoffMethod=no
PrintCutoffValues=no
PrintOperatorMethod=no
PrintOperatorValues=no
PrintCombinations=yes
PrintFeatureSets=yes
PrintCombinations=no
PrintFeatureSets=no
PrintCutoffSets=no
PrintCutOffsetsBestLength=no
PrintPerformance=yes
PrintSets=no
SavePartitions=no
[Run]
Subsumption=no
BranchBound=no
Parallel=no
ParallelMethod=ONE
ParallelMethod=TWO
BinaryReduction=no

7 changes: 3 additions & 4 deletions inst/examples/complexity/binary_3.project
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,16 @@ PrintCutoffMethod=no
PrintCutoffValues=no
PrintOperatorMethod=no
PrintOperatorValues=no
PrintCombinations=yes
PrintFeatureSets=yes
PrintCombinations=no
PrintFeatureSets=no
PrintCutoffSets=no
PrintCutOffsetsBestLength=no
PrintPerformance=yes
PrintSets=no
SavePartitions=no
[Run]
Subsumption=no
BranchBound=no
Parallel=no
ParallelMethod=ONE
ParallelMethod=TWO
BinaryReduction=no

Loading

0 comments on commit a3a1e0b

Please sign in to comment.