add scaling to qmedist(), mmedist()

lbbe-software · Oct 25, 2024 · d7f1e57 · d7f1e57
1 parent 4fa8de2
commit d7f1e57
Show file tree

Hide file tree

Showing 7 changed files with 83 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -9,7 +9,7 @@ BUG FIX
 
 - mgedist() may suffer a numerical issue for Anderson-Darling GoF metrics. All GoF metrics now take care of numerical issue, such as log(0) or 1/0, and are properly scaled by the sample sized to avoid large sample size issues. Thanks for Ethan Chapman for reporting the bug.
 - the default starting value for the gamma distribution was wrongly computed for the rate parameter. Thanks for Wendy Martin for reporting the bug.
-- mledist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues.
+- mledist(), mmedist(), qmedist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues.
 
 # fitdistrplus 1.2-1
 

diff --git a/R/mmedist.R b/R/mmedist.R
@@ -386,7 +386,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL,
         (momemp - momtheo)^2
       }
       fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights)
-        sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) )
+        mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) )
     }else
     {
       DIFF2 <- function(par, fix.arg, order, obs, mdistnam, memp, weights)
@@ -396,7 +396,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL,
         (momemp - momtheo)^2
       }
       fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights)
-        sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) )
+        mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) )
     }
 
     cens <- FALSE

diff --git a/R/qmedist.R b/R/qmedist.R
@@ -133,7 +133,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL,
     }
 
     fnobj <- function(par, fix.arg, obs, qdistnam, qtype)
-      sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
+      mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
 
   }else if (!cens && !is.null(weights)) 
   {
@@ -144,7 +144,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL,
       (qemp - qtheo)^2
     }
     fnobj <- function(par, fix.arg, obs, qdistnam, qtype)
-      sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
+      mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
   }
 
   # Function to calculate the loglikelihood to return

diff --git a/tests/t-mledist-nocens.R b/tests/t-mledist-nocens.R
@@ -114,7 +114,7 @@ x <- c(rnorm(1000, 5),  rnorm(1000, 10))
 #MLE fit
 fit1 <- mledist(x, "norm2", start=list(w=1/3, m1=4, s1=2, m2=8, s2=2), 
                 lower=c(0, 0, 0, 0, 0))
-
+fit1
 
 #fitted coef around 0.5003298  4.9842719  0.9909527 10.0296973  1.0024444 , fitted loglik -4185.114
 
@@ -307,8 +307,6 @@ x <- c(rep(0, n), rpois(n, 10), rpois(n, 50))
 mledist(x, "pois", optim.method="Nelder-Mead", control=list(maxit=10))
 
 
-
-
 # (15) basic fit of a normal distribution with new fix.arg/start.arg
 #
 

diff --git a/tests/t-mmedist.R b/tests/t-mmedist.R
@@ -8,6 +8,8 @@ set.seed(1234)
 x1 <- rnorm(n=nsample)
 mmedist(x1,"norm")
 
+#fitted coef is -0.3831574  0.9446870, fitted loglik is -13.62037
+
 try(mmedist(x1,"norm", fix.arg=list(mean=0)))
 
 # (2) fit a discrete distribution (Poisson)
@@ -17,14 +19,16 @@ set.seed(1234)
 x2 <- rpois(n=nsample, lambda = 2)
 mmedist(x2,"pois")
 
+#fitted coef is 1.7, fitted loglik is -15.31626
+
 # (3) fit a finite-support distribution (beta)
 #
 
 set.seed(1234)
 x3 <- rbeta(n=nsample, shape1=5, shape2=10)
 mmedist(x3,"beta")
 
-
+#fitted coef is 3.956081 9.512364, fitted loglik is 6.939847
 
 # (4) fit a Pareto distribution
 #
@@ -42,6 +46,8 @@ if(any(installed.packages()[, "Package"] == "actuar"))
   #fit
   mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), 
           lower=1, upper=Inf)
+  #fitted coef is 127.4319  64.2936, fitted loglik is -3.689438
+
   mmedist(x4, "pareto", order=1, memp=memp, start=list(shape=10), fix.arg=list(scale=1.5), 
           lower=2, upper=Inf)
   mmedist(x4, "pareto", order=1, memp=memp, start=function(x) list(shape=10), fix.arg=list(scale=1.5), 
@@ -78,6 +84,7 @@ s2 <- log(1+var(x3)/mean(x3)^2*(n-1)/n)
 mu <- log(mean(x3)) - s2/2
 cbind(c(mu, s2), f2$estimate)
 
+#fitted coef is -0.1123774  1.1839216, fitted loglik is -12.99279
 
 c(truestim=exp(mu+s2/2), 
   jensen=as.numeric(exp(f1$estimate["meanlog"]+f1$estimate["sdlog"]^2/2)), 
@@ -124,6 +131,8 @@ fitdistrplus:::wtdvar(x1, w)
 
 mmedist(exp(x1), "lnorm", weights=w)$estimate
 
+#fitted coef is -0.4199772  0.6276551, fitted loglik is -26.82328
+
 #test non integer weights
 try(mmedist(x1, "norm", weights=rep(1/3, length(x1))))
 try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.001), start=list(lambda=mean(x))))
@@ -132,7 +141,7 @@ try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.0000001), start=list(lambda=mea
 
 # (8) fit of a neg binom distribution with weighted moment matching estimation
 #
-
+set.seed(1234)
 x4 <- rnbinom(nsample, 5, 1/2)
 table(x4)
 w <- rep(1, length(x4))
@@ -141,6 +150,8 @@ w[x4 > 5] <- 2
 mmedist(x4, "nbinom", weights=w)$estimate
 mmedist(x4, "nbinom", weights=NULL)$estimate
 
+#fitted coef is 12.980769  4.615385, fitted loglik is -29.67728
+
 # (9) relevant example for zero modified geometric distribution
 #
 
@@ -207,6 +218,9 @@ w[x5 > 5] <- 2
 
 mmedist(x5, "zmgeom", order=1:2, memp=memp1, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])), 
         lower=0.01, upper=0.99)$estimate
+
+#fitted coef is 0.4528775 0.1402878 , fitted loglik is -26.80729
+
 mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])),  
         weights=w)$estimate
 
@@ -221,6 +235,7 @@ mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/m
 if(any(installed.packages()[, "Package"] == "actuar"))
 {
   require("actuar")
+  set.seed(1234)
   #simulate a sample
   x4 <- rpareto(nsample, 6, 2)
 
@@ -231,4 +246,25 @@ if(any(installed.packages()[, "Package"] == "actuar"))
   #fit
   mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "L-BFGS-B") #L-BFGS-B via optim
   mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "Nelder") #Nelder Mead via constrOptim
+
+  #fitted coef is 4.294391 1.597783, fitted loglik is -2.173544
 }
+
+
+# (11) large sample size issue
+
+if(FALSE)
+{
+  set.seed(123)
+  obs <- rlnorm(1e7, 3, 2)
+  for(i in 2:7)
+    cat(i, try(mmedist(obs[1:10^i], "lnorm", control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n")
+
+  # 2 3.795472 1.370517 
+  # 3 3.569512 1.67947 
+  # 4 3.241516 1.881192 
+  # 5 3.179215 1.902745 
+  # 6 2.993209 2.004211 
+  # 7 2.996089 2.002622 
+}
+
diff --git a/tests/t-msedist.R b/tests/t-msedist.R
@@ -35,11 +35,15 @@ legend("bottomright", lty=1:2, col=c("green", "blue"), leg=c("theoretical value"
 
 msedist(x1, "exp", control=list(trace=0, REPORT=1))
 
+#fitted coef is 1.336924, fitted loglik is -5.610213
+
 mse_exp <- fitdist(x1, "exp", method="mse")
 plot(mse_exp)
 summary(mse_exp)
 gofstat(mse_exp)
 
+#fitted coef is -0.1486435  0.9892013, fitted loglik is -13.92195
+
 mse_exp_boot <- bootdist(mse_exp, niter = nboot)
 plot(mse_exp_boot)
 abline(v=1, col="green")
@@ -88,6 +92,8 @@ optim(c(2,2), Dn)
 msedist(x1, "lnorm", control=list(trace=0, REPORT=1))
 
 
+#fitted coef is -0.5939281  0.6723368, fitted loglik is -2.380166
+
 mse_lnorm <- fitdist(x1, "lnorm", method="mse")
 mle_lnorm <- fitdist(x1, "lnorm", method="mle")
 plot(mse_lnorm)
@@ -113,6 +119,7 @@ optim(c(1,1,10), Dn)
 
 msedist(x1, "burr", start=list(shape1=1, shape2=1, rate=10), control=list(trace=0, REPORT=1))
 
+#fitted coef is 0.609447 3.343173 3.811287, fitted loglik is -3.023078
 
 mse_burr <- fitdist(x1, "burr", method="mse", start=list(shape1=1, shape2=1, rate=10))
 mle_burr <- fitdist(x1, "burr", method="mle", start=list(shape1=1, shape2=1, rate=10))

diff --git a/tests/t-qmedist.R b/tests/t-qmedist.R
@@ -8,6 +8,7 @@ set.seed(1234)
 x1 <- rnorm(n=nsample)
 qmedist(x1, "norm", probs=c(1/3, 2/3))
 
+#fitted coef is -0.1486435  0.9892013, fitted loglik is -13.92195
 
 # (2) defining your own distribution functions, here for the Gumbel 
 # distribution for other distributions, see the CRAN task view dedicated 
@@ -17,20 +18,25 @@ dgumbel <- function(x, a, b) 1/b*exp((a-x)/b)*exp(-exp((a-x)/b))
 qgumbel <- function(p, a, b) a - b*log(-log(p))
 qmedist(x1, "gumbel", probs=c(1/3, 2/3), start=list(a=10,b=5))
 
+#fitted coef is -0.4935571  0.8557281, fitted loglik is -16.79828
+
 # (3) fit a discrete distribution (Poisson)
 #
 
 set.seed(1234)
 x2 <- rpois(n=nsample,lambda = 2)
 qmedist(x2, "pois", probs=1/2)
 
+#fitted coef is 1.7, fitted loglik is -15.31626
+
 # (4) fit a finite-support distribution (beta)
 #
 
 set.seed(1234)
 x3 <- rbeta(n=nsample, shape1=5, shape2=10)
 qmedist(x3, "beta", probs=c(1/3, 2/3))
 
+#fitted coef is 4.010999 8.397853, fitted loglik is 6.642124
 
 # (5) fit frequency distributions on USArrests dataset.
 #
@@ -39,6 +45,8 @@ x4 <- USArrests$Assault
 qmedist(x4, "pois", probs=1/2)
 qmedist(x4, "nbinom", probs=c(1/3, 2/3))
 
+#fitted coef is 2.518966 182.313344, fitted loglik is 292.5969
+
 # (6) normal mixture
 #
 
@@ -65,7 +73,8 @@ fit2 <- qmedist(x, "norm2", probs=c(1/6, 1/4, 1/3, 1/2, 2/3),
 	start=list(poid=1/3, m1=4, s1=2, m2=8, s2=2), 
 	lower=c(0, 0, 0, 0, 0), upper=c(1/2, Inf, Inf, Inf, Inf))
 
-
+fit2
+#fitted coef is 0.3433528 4.2872449 0.3891135 9.2598612 1.7948554, fitted loglik is -38.14106
 
 # (7) test error messages
 #
@@ -106,14 +115,11 @@ x <- rexp(nsample)
 f3 <- qmedist(x, "exp", probs=1/2)
 w4 <- c(rep(1, nsample/2), round(sqrt(1:(nsample/2))))
 f4 <- qmedist(x, "exp", weights=w4, probs=1/2)
-f3$estimate
-f4$estimate
+c(f3$estimate, f4$estimate)
 
-f3$loglik
-f4$loglik
+c(f3$loglik, f4$loglik)
 
-median(x)
-median(tail(x, 50))
+#fitted coef is 0.4816191, fitted loglik is -16.95355
 
 
 #try non integer weights
@@ -128,3 +134,22 @@ qmedist(x, "norm", probs=1:2/3, control=list(maxit=2), start=list(mean=1e5, sd=1
 x <- rnorm(nsample)
 qmedist(x, "norm", probs=1:2/3, optim.method="L-BFGS-B", lower=c(-Inf, 0)) #via optim
 qmedist(x, "norm", probs=1:2/3, optim.method="Nelder", lower=c(-Inf, 0)) #via constrOptim
+
+
+# (11) large sample size issue
+
+if(FALSE)
+{
+  set.seed(123)
+  obs <- rlnorm(1e7, 3, 2)
+  for(i in 2:7)
+    cat(i, try(qmedist(obs[1:10^i], "lnorm", probs=1:2/3, control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n")
+
+  # 2 3.109257 1.767013 
+  # 3 3.022615 1.857885 
+  # 4 2.999376 1.978701 
+  # 5 3.003344 2.00941 
+  # 6 2.99881 2.001733 
+  # 7 2.999859 2.000227 
+}
+