do v2.7.0 -- see NEWS

Techtonique · Nov 8, 2024 · 0fafdf6 · 0fafdf6
1 parent 8fda364
commit 0fafdf6
Show file tree

Hide file tree

Showing 18 changed files with 1,233 additions and 594 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: learningmachine
 Type: Package
 Title: Machine Learning with Explanations and Uncertainty Quantification
-Version: 2.6.0
-Date: 2024-09-20
+Version: 2.7.0
+Date: 2024-11-08
 Author: T. Moudiki
 Maintainer: T. Moudiki <[email protected]>
 Description: Regression-based Machine Learning with explanations and uncertainty quantification.
@@ -29,5 +29,6 @@ Suggests:
     roxygen2,
     testthat (>= 3.0.0),
     xgboost
+Roxygen: list(markdown = TRUE)
 VignetteBuilder: knitr
 Config/testthat/edition: 3
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: build buildsite check clean coverage docs getwd initialize install installcranpkg installgithubpkg installedpkgs load removepkg render setwd start test usegit
+.PHONY: build buildsite check clean cleanvars coverage docs getwd initialize install installcranpkg installgithubpkg installedpkgs load removepkg render setwd start test usegit
 .DEFAULT_GOAL := help
 
 # The directory where R files are stored
@@ -44,8 +44,21 @@ check: clean setwd ## check package
 clean: ## remove all build, and artifacts
 	rm -f .Rhistory
 	rm -f *.RData
-	rm -f *.Rproj
+	rm -f *.Rproj	
 	rm -rf .Rproj.user
+	rm -f src/*.o
+	rm -f src/*.so
+	rm -f src/*.tmp
+	rm -f vignettes/*.html
+
+cleanvars: setwd ## remove all local variables
+	@read -p "Do you want to remove all local variables in R? (1-yes, 2-no): " choice; \
+	if [ $$choice -eq 1 ]; then \
+		echo "Removing all local variables..."; \
+		Rscript -e "rm(list=ls())"; \
+	else \
+		echo "Keeping the variables."; \
+	fi
 
 coverage: ## get test coverage
 	Rscript -e "devtools::test_coverage('.')"
@@ -60,8 +73,12 @@ docs: clean setwd ## generate docs
 getwd: ## get current directory
 	Rscript -e "getwd()"
 
-install: clean setwd ## install current package
+install: clean setwd docs ## install current package
 	Rscript -e "try(devtools::install('.'), silent = FALSE)"
+	@read -p "Start R session? (y/n): " choice; \
+	if [ "$$choice" = "y" ]; then \
+		$(MAKE) start; \
+	fi
 
 installcranpkg: setwd ## install a package
 	@read -p "Enter the name of package to be installed: " pckg; \
@@ -88,10 +105,14 @@ initialize: setwd ## initialize: install packages devtools, usethis, pkgdown and
 help: ## print menu with all options
 	@python3 -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
 
-load: clean setwd ## load all (when developing the package)
+load: clean setwd docs ## load all and restart (when developing the package)
 	Rscript -e "devtools::load_all('.')"
+	@read -p "Start R session? (y/n): " choice; \
+	if [ "$$choice" = "y" ]; then \
+		$(MAKE) start; \
+	fi
 
-removepkg: ## remove package
+removepkg: clean ## remove package
 	@read -p "Enter the name of package to be removed: " pckg; \
 	if [ -z "$$pckg" ]; then \
 		echo "Package name cannot be empty."; \

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# version 2.6.0 (2024-11-08)
+
+- conformal confidence intervals for average effects
+- use Wilcoxon signed-rank test for nonparametric test on average effects
+
 # version 2.5.0 (2024-09-20)
 
 - tests for nonparametric confidence intervals of average effects (based on rescaled winkler scores)

diff --git a/R/Base.R b/R/Base.R
@@ -197,7 +197,7 @@ Base <-
                          class_name = NULL,
                          class_index = NULL, 
                          y = NULL,
-                         type_ci = c("student", "nonparametric", "bootstrap"),
+                         type_ci = c("student", "nonparametric", "bootstrap", "conformal"),
                          cl = NULL) {
         if (is.null(self$engine) || is.null(self$model) || is.null(self$type))
           stop(paste0(self$name, " must be fitted first (use ", self$name, "$fit())"))
@@ -331,6 +331,23 @@ Base <-
             return(c(0, NA, NA, NA))
           }                    
         }
+
+        foo_conformal_tests <- function(x)
+        {
+          res <- try(conformal_ci_mean(x), 
+                     silent = TRUE)
+          if (!inherits(res, "try-error"))
+          {
+            return(c(
+            as.numeric(res$estimate),
+            res$lower,
+            res$upper,
+            res$pvalue
+          ))
+          } else {
+            return(c(0, NA, NA, NA))
+          }                    
+        }
 
         lower_signif_codes <- c(0, 0.001, 0.01, 0.05, 0.1)
         upper_signif_codes <- c(0.001, 0.01, 0.05, 0.1, 1)
@@ -348,6 +365,9 @@ Base <-
         if (identical(type_ci, "bootstrap"))
           citests <- try(data.frame(t(apply(effects, 2, foo_bootstrap_tests))), silent = TRUE)
 
+        if (identical(type_ci, "conformal"))
+          citests <- try(data.frame(t(apply(effects, 2, foo_conformal_tests))), silent = TRUE)
+
         #misc::debug_print(citests)
 
         if (!inherits(citests, "try-error"))
@@ -363,9 +383,11 @@ Base <-
               if (self$type == "regression")
               {
                 coverage_rate <- 100 * mean((y >= as.numeric(preds$lower)) * (y <= as.numeric(preds$upper)))
+
                 R_squared <- 1 - sum((y - preds$preds) ^ 2) / sum((y - mean(y)) ^ 2)
                 R_squared_adj <-
                   1 - (1 - R_squared) * (length(y) - 1) / (length(y) - ncol(X) - 1)
+
                 Residuals <- y - preds$preds
                 return(list(
                   R_squared = R_squared,

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -45,6 +45,10 @@ find_lam_eigen <- function(Eigenvectors, Eigenvalues, y, reg_lambda_vector) {
     .Call(`_learningmachine_find_lam_eigen`, Eigenvectors, Eigenvalues, y, reg_lambda_vector)
 }
 
+fastSampleCpp <- function(resids, n) {
+    .Call(`_learningmachine_fastSampleCpp`, resids, n)
+}
+
 empirical_quantile_cpp <- function(x, q) {
     .Call(`_learningmachine_empirical_quantile_cpp`, x, q)
 }

diff --git a/R/utils_all.R b/R/utils_all.R
@@ -748,6 +748,13 @@ t_test_from_ci <- function(mean_estimate, lower_bound, upper_bound, df, conf_lev
   return(p_value)
 }
 
+# Non-parametric test using Wilcoxon Signed-Rank Test
+wilcoxon_test <- function(x, mu_0 = 0) {
+  # Perform a one-sample Wilcoxon signed-rank test
+  test_result <- wilcox.test(x, mu = mu_0)
+  return(test_result$p.value)
+}
+
 compute_ci_mean <- function(xx, 
                             type_split = c("random", 
                                            "sequential"), 
@@ -793,12 +800,7 @@ compute_ci_mean <- function(xx,
   estimate <- stats::median(pseudo_means_x)
   lower <- quantile(pseudo_means_x, probs = 1 - upper_prob)
   upper <- quantile(pseudo_means_x, probs = upper_prob)
-  pvalue <- t_test_from_ci(mean_estimate = estimate, 
-                           lower_bound = lower, 
-                           upper_bound = upper, 
-                           df = length(pseudo_means_x) - 1,
-                           conf_level = level/100, 
-                           mu_0 = 0)
+  pvalue <- wilcoxon_test(pseudo_means_x)
 
   return(list(
     estimate = estimate,
@@ -813,7 +815,7 @@ compute_ci_mean <- function(xx,
 bootstrap_ci_mean <- function(xx, level = 95, seed = 123) {
 
   set.seed(seed)
-  B <- 500L
+  B <- 1000L
   alpha <- 1 - level / 100
 
   bootstrap_means <- rep(0, B)  
@@ -824,19 +826,58 @@ bootstrap_ci_mean <- function(xx, level = 95, seed = 123) {
   }
 
   # Step 2: Calculate the confidence interval
-  estimate <- mean(xx)
+  estimate <- median(bootstrap_means)
   lower_bound <- quantile(bootstrap_means, alpha / 2)
   upper_bound <- quantile(bootstrap_means, 1 - alpha / 2)  
-  pvalue <- t_test_from_ci(mean_estimate = estimate, 
-                           lower_bound = lower_bound, 
-                           upper_bound = upper_bound, 
-                           df = length(xx) - 1,
-                           conf_level = level/100, 
-                           mu_0 = 0)
+  pvalue <- wilcoxon_test(bootstrap_means)
 
   return(list(
     estimate = estimate,
     lower = lower_bound,
     upper = upper_bound,    
     pvalue = pvalue))
+}
+
+
+boot_means_matrix <- function(B, mean_estimate, resids, seed) {
+  set.seed(seed)
+  n <- length(resids)
+  # Add mean_estimate to each resampled residual and compute means
+  colMeans(matrix(resids[matrix(sample(n, size = n * B, replace = TRUE), nrow = n, ncol = B)], 
+  nrow = n, ncol = B) + mean_estimate)
+}
+
+conformal_ci_mean <- function(xx, level = 95, seed = 123, cpp=TRUE) {
+  set.seed(seed)
+  alpha <- 1 - (level / 100)
+  n_xx <- length(xx)
+  half_n_xx <- n_xx %/% 2
+  idx_train <- sample(seq_len(n_xx), replace = FALSE, size = half_n_xx)
+
+  # Calculate the mean estimate on the sampled training set
+  mean_estimate <- mean(xx[idx_train])
+
+  # Calculate residuals from the remaining data points
+  resids <- xx[-idx_train] - mean_estimate
+
+  # Bootstrap the residuals to estimate the variability
+  B <- 500L  # Number of bootstrap samples
+
+  if (cpp)
+  {
+    boot_samples <- matrix(fastSampleCpp(resids, B * length(resids)), ncol = B)
+  } else {
+    boot_samples <- matrix(sample(resids, size = B * length(resids), replace = TRUE), ncol = B)
+  }
+  boot_means <- colMeans(mean_estimate + boot_samples)
+
+  # Derive the confidence interval from the bootstrap distribution of means
+  lower <- as.numeric(quantile(boot_means, alpha / 2))
+  upper <- as.numeric(quantile(boot_means, 1 - alpha / 2))
+
+  return(list(
+    estimate = median(boot_means),
+    lower = lower,
+    upper =  upper,
+    pvalue = wilcoxon_test(boot_means)))
 }
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -7,5 +7,5 @@ articles:
   getting-updates-bayesianrvfl: getting-updates-bayesianrvfl.html
   getting-updates: getting-updates.html
   qrnn: qrnn.html
-last_built: 2024-09-18T16:57Z
+last_built: 2024-09-21T03:05Z
 
diff --git a/man/Base.Rd b/man/Base.Rd
diff --git a/man/Classifier.Rd b/man/Classifier.Rd