Skip to content

Commit

Permalink
add scaling to qmedist(), mmedist()
Browse files Browse the repository at this point in the history
  • Loading branch information
dutangc committed Oct 25, 2024
1 parent 4fa8de2 commit d7f1e57
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 17 deletions.
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ BUG FIX

- mgedist() may suffer a numerical issue for Anderson-Darling GoF metrics. All GoF metrics now take care of numerical issue, such as log(0) or 1/0, and are properly scaled by the sample sized to avoid large sample size issues. Thanks for Ethan Chapman for reporting the bug.
- the default starting value for the gamma distribution was wrongly computed for the rate parameter. Thanks for Wendy Martin for reporting the bug.
- mledist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues.
- mledist(), mmedist(), qmedist() may suffer a scaling issue and the objective function is properly scaled by the sample sized to avoid large sample size issues.

# fitdistrplus 1.2-1

Expand Down
4 changes: 2 additions & 2 deletions R/mmedist.R
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL,
(momemp - momtheo)^2
}
fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights)
sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) )
mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp)) )
}else
{
DIFF2 <- function(par, fix.arg, order, obs, mdistnam, memp, weights)
Expand All @@ -396,7 +396,7 @@ mmedist <- function (data, distr, order, memp, start=NULL, fix.arg=NULL,
(momemp - momtheo)^2
}
fnobj <- function(par, fix.arg, obs, mdistnam, memp, weights)
sum( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) )
mean( sapply(order, function(o) DIFF2(par, fix.arg, o, obs, mdistnam, memp, weights)) )
}

cens <- FALSE
Expand Down
4 changes: 2 additions & 2 deletions R/qmedist.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL,
}

fnobj <- function(par, fix.arg, obs, qdistnam, qtype)
sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )

}else if (!cens && !is.null(weights))
{
Expand All @@ -144,7 +144,7 @@ qmedist <- function (data, distr, probs, start=NULL, fix.arg=NULL,
(qemp - qtheo)^2
}
fnobj <- function(par, fix.arg, obs, qdistnam, qtype)
sum( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
mean( sapply(probs, function(p) DIFF2Q(par, fix.arg, p, obs, qdistnam, qtype)) )
}

# Function to calculate the loglikelihood to return
Expand Down
4 changes: 1 addition & 3 deletions tests/t-mledist-nocens.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ x <- c(rnorm(1000, 5), rnorm(1000, 10))
#MLE fit
fit1 <- mledist(x, "norm2", start=list(w=1/3, m1=4, s1=2, m2=8, s2=2),
lower=c(0, 0, 0, 0, 0))

fit1

#fitted coef around 0.5003298 4.9842719 0.9909527 10.0296973 1.0024444 , fitted loglik -4185.114

Expand Down Expand Up @@ -307,8 +307,6 @@ x <- c(rep(0, n), rpois(n, 10), rpois(n, 50))
mledist(x, "pois", optim.method="Nelder-Mead", control=list(maxit=10))




# (15) basic fit of a normal distribution with new fix.arg/start.arg
#

Expand Down
40 changes: 38 additions & 2 deletions tests/t-mmedist.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ set.seed(1234)
x1 <- rnorm(n=nsample)
mmedist(x1,"norm")

#fitted coef is -0.3831574 0.9446870, fitted loglik is -13.62037

try(mmedist(x1,"norm", fix.arg=list(mean=0)))

# (2) fit a discrete distribution (Poisson)
Expand All @@ -17,14 +19,16 @@ set.seed(1234)
x2 <- rpois(n=nsample, lambda = 2)
mmedist(x2,"pois")

#fitted coef is 1.7, fitted loglik is -15.31626

# (3) fit a finite-support distribution (beta)
#

set.seed(1234)
x3 <- rbeta(n=nsample, shape1=5, shape2=10)
mmedist(x3,"beta")


#fitted coef is 3.956081 9.512364, fitted loglik is 6.939847

# (4) fit a Pareto distribution
#
Expand All @@ -42,6 +46,8 @@ if(any(installed.packages()[, "Package"] == "actuar"))
#fit
mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10),
lower=1, upper=Inf)
#fitted coef is 127.4319 64.2936, fitted loglik is -3.689438

mmedist(x4, "pareto", order=1, memp=memp, start=list(shape=10), fix.arg=list(scale=1.5),
lower=2, upper=Inf)
mmedist(x4, "pareto", order=1, memp=memp, start=function(x) list(shape=10), fix.arg=list(scale=1.5),
Expand Down Expand Up @@ -78,6 +84,7 @@ s2 <- log(1+var(x3)/mean(x3)^2*(n-1)/n)
mu <- log(mean(x3)) - s2/2
cbind(c(mu, s2), f2$estimate)

#fitted coef is -0.1123774 1.1839216, fitted loglik is -12.99279

c(truestim=exp(mu+s2/2),
jensen=as.numeric(exp(f1$estimate["meanlog"]+f1$estimate["sdlog"]^2/2)),
Expand Down Expand Up @@ -124,6 +131,8 @@ fitdistrplus:::wtdvar(x1, w)

mmedist(exp(x1), "lnorm", weights=w)$estimate

#fitted coef is -0.4199772 0.6276551, fitted loglik is -26.82328

#test non integer weights
try(mmedist(x1, "norm", weights=rep(1/3, length(x1))))
try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.001), start=list(lambda=mean(x))))
Expand All @@ -132,7 +141,7 @@ try(mmedist(1:10, "pois", weights=c(rep(1, 9), 1.0000001), start=list(lambda=mea

# (8) fit of a neg binom distribution with weighted moment matching estimation
#

set.seed(1234)
x4 <- rnbinom(nsample, 5, 1/2)
table(x4)
w <- rep(1, length(x4))
Expand All @@ -141,6 +150,8 @@ w[x4 > 5] <- 2
mmedist(x4, "nbinom", weights=w)$estimate
mmedist(x4, "nbinom", weights=NULL)$estimate

#fitted coef is 12.980769 4.615385, fitted loglik is -29.67728

# (9) relevant example for zero modified geometric distribution
#

Expand Down Expand Up @@ -207,6 +218,9 @@ w[x5 > 5] <- 2

mmedist(x5, "zmgeom", order=1:2, memp=memp1, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])),
lower=0.01, upper=0.99)$estimate

#fitted coef is 0.4528775 0.1402878 , fitted loglik is -26.80729

mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/mean(x5[x5 > 0])),
weights=w)$estimate

Expand All @@ -221,6 +235,7 @@ mmedist(x5, "zmgeom", order=1:2, memp=memp2, start=list(p1=mean(x5 == 0), p2=1/m
if(any(installed.packages()[, "Package"] == "actuar"))
{
require("actuar")
set.seed(1234)
#simulate a sample
x4 <- rpareto(nsample, 6, 2)

Expand All @@ -231,4 +246,25 @@ if(any(installed.packages()[, "Package"] == "actuar"))
#fit
mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "L-BFGS-B") #L-BFGS-B via optim
mmedist(x4, "pareto", order=c(1, 2), memp=memp, start=c(shape=10, scale=10), lower=1, upper=Inf, optim.method = "Nelder") #Nelder Mead via constrOptim

#fitted coef is 4.294391 1.597783, fitted loglik is -2.173544
}


# (11) large sample size issue

if(FALSE)
{
set.seed(123)
obs <- rlnorm(1e7, 3, 2)
for(i in 2:7)
cat(i, try(mmedist(obs[1:10^i], "lnorm", control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n")

# 2 3.795472 1.370517
# 3 3.569512 1.67947
# 4 3.241516 1.881192
# 5 3.179215 1.902745
# 6 2.993209 2.004211
# 7 2.996089 2.002622
}

7 changes: 7 additions & 0 deletions tests/t-msedist.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,15 @@ legend("bottomright", lty=1:2, col=c("green", "blue"), leg=c("theoretical value"

msedist(x1, "exp", control=list(trace=0, REPORT=1))

#fitted coef is 1.336924, fitted loglik is -5.610213

mse_exp <- fitdist(x1, "exp", method="mse")
plot(mse_exp)
summary(mse_exp)
gofstat(mse_exp)

#fitted coef is -0.1486435 0.9892013, fitted loglik is -13.92195

mse_exp_boot <- bootdist(mse_exp, niter = nboot)
plot(mse_exp_boot)
abline(v=1, col="green")
Expand Down Expand Up @@ -88,6 +92,8 @@ optim(c(2,2), Dn)
msedist(x1, "lnorm", control=list(trace=0, REPORT=1))


#fitted coef is -0.5939281 0.6723368, fitted loglik is -2.380166

mse_lnorm <- fitdist(x1, "lnorm", method="mse")
mle_lnorm <- fitdist(x1, "lnorm", method="mle")
plot(mse_lnorm)
Expand All @@ -113,6 +119,7 @@ optim(c(1,1,10), Dn)

msedist(x1, "burr", start=list(shape1=1, shape2=1, rate=10), control=list(trace=0, REPORT=1))

#fitted coef is 0.609447 3.343173 3.811287, fitted loglik is -3.023078

mse_burr <- fitdist(x1, "burr", method="mse", start=list(shape1=1, shape2=1, rate=10))
mle_burr <- fitdist(x1, "burr", method="mle", start=list(shape1=1, shape2=1, rate=10))
Expand Down
39 changes: 32 additions & 7 deletions tests/t-qmedist.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set.seed(1234)
x1 <- rnorm(n=nsample)
qmedist(x1, "norm", probs=c(1/3, 2/3))

#fitted coef is -0.1486435 0.9892013, fitted loglik is -13.92195

# (2) defining your own distribution functions, here for the Gumbel
# distribution for other distributions, see the CRAN task view dedicated
Expand All @@ -17,20 +18,25 @@ dgumbel <- function(x, a, b) 1/b*exp((a-x)/b)*exp(-exp((a-x)/b))
qgumbel <- function(p, a, b) a - b*log(-log(p))
qmedist(x1, "gumbel", probs=c(1/3, 2/3), start=list(a=10,b=5))

#fitted coef is -0.4935571 0.8557281, fitted loglik is -16.79828

# (3) fit a discrete distribution (Poisson)
#

set.seed(1234)
x2 <- rpois(n=nsample,lambda = 2)
qmedist(x2, "pois", probs=1/2)

#fitted coef is 1.7, fitted loglik is -15.31626

# (4) fit a finite-support distribution (beta)
#

set.seed(1234)
x3 <- rbeta(n=nsample, shape1=5, shape2=10)
qmedist(x3, "beta", probs=c(1/3, 2/3))

#fitted coef is 4.010999 8.397853, fitted loglik is 6.642124

# (5) fit frequency distributions on USArrests dataset.
#
Expand All @@ -39,6 +45,8 @@ x4 <- USArrests$Assault
qmedist(x4, "pois", probs=1/2)
qmedist(x4, "nbinom", probs=c(1/3, 2/3))

#fitted coef is 2.518966 182.313344, fitted loglik is 292.5969

# (6) normal mixture
#

Expand All @@ -65,7 +73,8 @@ fit2 <- qmedist(x, "norm2", probs=c(1/6, 1/4, 1/3, 1/2, 2/3),
start=list(poid=1/3, m1=4, s1=2, m2=8, s2=2),
lower=c(0, 0, 0, 0, 0), upper=c(1/2, Inf, Inf, Inf, Inf))


fit2
#fitted coef is 0.3433528 4.2872449 0.3891135 9.2598612 1.7948554, fitted loglik is -38.14106

# (7) test error messages
#
Expand Down Expand Up @@ -106,14 +115,11 @@ x <- rexp(nsample)
f3 <- qmedist(x, "exp", probs=1/2)
w4 <- c(rep(1, nsample/2), round(sqrt(1:(nsample/2))))
f4 <- qmedist(x, "exp", weights=w4, probs=1/2)
f3$estimate
f4$estimate
c(f3$estimate, f4$estimate)

f3$loglik
f4$loglik
c(f3$loglik, f4$loglik)

median(x)
median(tail(x, 50))
#fitted coef is 0.4816191, fitted loglik is -16.95355


#try non integer weights
Expand All @@ -128,3 +134,22 @@ qmedist(x, "norm", probs=1:2/3, control=list(maxit=2), start=list(mean=1e5, sd=1
x <- rnorm(nsample)
qmedist(x, "norm", probs=1:2/3, optim.method="L-BFGS-B", lower=c(-Inf, 0)) #via optim
qmedist(x, "norm", probs=1:2/3, optim.method="Nelder", lower=c(-Inf, 0)) #via constrOptim


# (11) large sample size issue

if(FALSE)
{
set.seed(123)
obs <- rlnorm(1e7, 3, 2)
for(i in 2:7)
cat(i, try(qmedist(obs[1:10^i], "lnorm", probs=1:2/3, control=list(trace=0, REPORT=1))$estimate, silent=TRUE), "\n")

# 2 3.109257 1.767013
# 3 3.022615 1.857885
# 4 2.999376 1.978701
# 5 3.003344 2.00941
# 6 2.99881 2.001733
# 7 2.999859 2.000227
}

0 comments on commit d7f1e57

Please sign in to comment.