Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremiedb committed Oct 5, 2023
2 parents 02a4679 + f8bdce8 commit db55427
Show file tree
Hide file tree
Showing 35 changed files with 1,699 additions and 534 deletions.
36 changes: 32 additions & 4 deletions .github/workflows/CompatHelper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,42 @@ on:
schedule:
- cron: 0 0 * * *
workflow_dispatch:
permissions:
contents: write
pull-requests: write
jobs:
CompatHelper:
runs-on: ubuntu-latest
steps:
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
- name: Check if Julia is already available in the PATH
id: julia_in_path
run: which julia
continue-on-error: true
- name: Install Julia, but only if it is not already available in the PATH
uses: julia-actions/setup-julia@v1
with:
version: '1'
arch: ${{ runner.arch }}
if: steps.julia_in_path.outcome != 'success'
- name: "Add the General registry via Git"
run: |
import Pkg
ENV["JULIA_PKG_SERVER"] = ""
Pkg.Registry.add("General")
shell: julia --color=yes {0}
- name: "Install CompatHelper"
run: |
import Pkg
name = "CompatHelper"
uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
version = "3"
Pkg.add(; name, uuid, version)
shell: julia --color=yes {0}
- name: "Run CompatHelper"
run: |
import CompatHelper
CompatHelper.main()
shell: julia --color=yes {0}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
run: julia -e 'using CompatHelper; CompatHelper.main()'
14 changes: 11 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,24 +1,32 @@
name = "EvoLinear"
uuid = "ab853011-1780-437f-b4b5-5de6f4777246"
authors = ["jeremie <[email protected]> and contributors"]
version = "0.3.0"
version = "0.4.3"

[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
Distributions = "0.25"
Flux = "0.13, 0.14"
LoopVectorization = "0.12"
MLJModelInterface = "1.0"
StatsBase = "0.33"
Optimisers = "0.2, 0.3"
StatsBase = "0.33, 0.34"
julia = "1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "MLJBase"]
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,12 @@ m = EvoLinear.fit(config; x, y, metric=:mse)
p = EvoLinear.predict_proj(m, x)
p = m(x)
```

Splines - Experimental

The number of knots for each selected feature is specified through a `Dict` of the form `Dict(feat_id::Int => nknots::Int)`.
```julia
# Request 4 knots for feature 1 and 8 knots for feature 5.
config = EvoSplineRegressor(loss=:mse, nrounds=10, knots = Dict(1 => 4, 5 => 8))
# NOTE(review): the experiment scripts in this change call `fit` with `x_train`/`y_train`
# keywords — confirm that `x`/`y` are still accepted here.
m = EvoLinear.fit(config; x, y, metric=:mse)
# The fitted spline model is called on the transposed matrix `x'`.
p = m(x')
```
78 changes: 78 additions & 0 deletions experiments/MLJ-spline.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
using Revise
using EvoLinear
using EvoLinear: logit, sigmoid
using StatsBase: sample
using MLJBase

##################################################
### Regression - small data
##################################################
# Synthetic single-feature problem: features are drawn uniformly from [-2, 3); the
# target is a rescaled sine, passed through `logit`, perturbed with Gaussian noise,
# then mapped back with `sigmoid`.
features = rand(10_000) .* 5 .- 2
X = reshape(features, (size(features)[1], 1))
Y = sin.(features) .* 0.5 .+ 0.5
Y = logit(Y) + randn(size(Y))
Y = sigmoid(Y)
y = Y
X = MLJBase.table(X)

# spline regression (4 knots on feature 1)
model = EvoSplineRegressor(loss=:mse, nrounds=10, knots = Dict(1 => 4))
mach = machine(model, X, y)
train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
fit!(mach, rows=train, verbosity=1)

# Raise the iteration count and refit the same machine, then inspect the round
# count recorded in the cache.
# NOTE(review): presumably exercises the MLJ `update` (warm restart) path — confirm.
mach.model.nrounds += 2
fit!(mach, rows=train, verbosity=1)
mach.cache[:info][:nrounds]

# predict on train data (mean absolute error)
pred_train = predict(mach, selectrows(X, train))
mean(abs.(pred_train - selectrows(Y, train)))

# predict on test data (mean absolute error)
pred_test = predict(mach, selectrows(X, test))
mean(abs.(pred_test - selectrows(Y, test)))

# `@test` is provided by the Test stdlib, which the script header does not load;
# bring it into scope here so this check runs instead of raising UndefVarError.
using Test: @test
@test MLJBase.iteration_parameter(EvoLinearRegressor) == :nrounds


##################################################
### Regression - matrix data
##################################################
# Same workflow as the previous section, but feeding a plain matrix instead of a
# table. `X` is the table built earlier in the script; `y`/`Y` are reused as-is.
X = MLJBase.matrix(X)
model = EvoLinearRegressor(loss=:logistic, nrounds=4)

mach = machine(model, X, y)
train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
fit!(mach, rows=train, verbosity=1)

# Increase the number of rounds and refit the same machine.
mach.model.nrounds += 2
fit!(mach, rows=train, verbosity=1)

# Mean absolute error on the training rows.
pred_train = predict(mach, selectrows(X, train))
mean(abs.(pred_train - selectrows(Y, train)))


####################################################################################
# tests that `update` handles data correctly in the case of a cold restart:
####################################################################################
# Case 1: table input. Fit through the low-level MLJBase API, then change both
# `L2` and `nrounds` before calling `update`.
# NOTE(review): changing `L2` is presumably what forces the cold restart mentioned
# in the section header — confirm against the `update` implementation.
X = MLJBase.table(rand(5, 2))
y = rand(5)
model = EvoLinearRegressor(loss=:mse)
data = MLJBase.reformat(model, X, y);
f, c, r = MLJBase.fit(model, 2, data...);
c[:info]
model.L2 = 0.1
model.nrounds += 2
MLJBase.update(model, 2, f, c, data...)
c[:info][:nrounds]

# Case 2: matrix input, same sequence. `update` is called twice with the same
# fitresult `f` and cache `c` returned by the initial `fit`.
X = rand(5, 2)
y = rand(5)
model = EvoLinearRegressor(loss=:mse)
data = MLJBase.reformat(model, X, y);
f, c, r = MLJBase.fit(model, 2, data...);
model.L2 = 0.1
model.nrounds += 2
MLJBase.update(model, 2, f, c, data...)
MLJBase.update(model, 2, f, c, data...)
169 changes: 169 additions & 0 deletions experiments/aicrowd-test.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
using Revise
using CSV
using DataFrames
using EvoLinear
using XGBoost
using StatsBase: sample
using Random: seed!

using AWS: AWSCredentials, AWSConfig, @service
@service S3
# Credentials are read from environment variables; `ENV[...]` throws a KeyError
# if either variable is unset.
aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
aws_config = AWSConfig(; creds = aws_creds, region = "ca-central-1")

# Download the CSV object from the "jeremiedb" bucket and parse it into a DataFrame.
path = "share/data/insurance-aicrowd.csv"
raw = S3.get_object(
"jeremiedb",
path,
Dict("response-content-type" => "application/octet-stream");
aws_config,
)
df = DataFrame(CSV.File(raw))
# Binary target: 1.0f0 when the row has a strictly positive claim amount, else 0.0f0.
transform!(df, "claim_amount" => ByRow(x -> x > 0 ? 1.0f0 : 0.0f0) => "event")

# Name of the target column (created from "claim_amount" earlier in the script).
target = "event"
# Columns used as model inputs; the spline config further down assigns knots to
# feature ids 1 through 9, one per entry of this list.
feats = [
"vh_age",
"vh_value",
"vh_speed",
"vh_weight",
"drv_age1",
"pol_no_claims_discount",
"pol_coverage",
"pol_duration",
"pol_sit_duration",
]

# Numeric encoding of the policy coverage labels.
pol_cov_dict = Dict{String,Float64}("Min" => 1, "Med1" => 2, "Med2" => 3, "Max" => 4)

"""
    pol_cov_map(x)

Return the numeric code stored in `pol_cov_dict` for the coverage label `x`.

Any `x` that is not a key of `pol_cov_dict` falls back to `4.0`, the same code as
`"Max"`. The fallback is a `Float64` literal so the function returns `Float64` for
every input, matching the dictionary's value type.
"""
pol_cov_map(x) = get(pol_cov_dict, x, 4.0)
# Replace the coverage labels in place with their numeric codes.
transform!(df, "pol_coverage" => ByRow(pol_cov_map) => "pol_coverage")

# Lists any requested feature that is not a column of `df`; the result is only
# displayed when run interactively, it is not asserted.
setdiff(feats, names(df))

# Seeded 80/20 split: sample 80% of the row indices without replacement for
# training, the remaining rows form the evaluation set.
seed!(123)
nobs = nrow(df)
id_train = sample(1:nobs, Int(round(0.8 * nobs)), replace = false)

# Rows with missing values in the selected columns are dropped after the split.
df_train = dropmissing(df[id_train, [feats..., target]])
df_eval = dropmissing(df[Not(id_train), [feats..., target]])

# Convert to Float32 arrays for the models below.
x_train = Matrix{Float32}(df_train[:, feats])
x_eval = Matrix{Float32}(df_eval[:, feats])
y_train = Vector{Float32}(df_train[:, target])
y_eval = Vector{Float32}(df_eval[:, target])

# Linear model with logistic loss; both `L1` and `L2` are set to 0.0.
config = EvoLinearRegressor(
T = Float32,
loss = :logistic,
L1 = 0.0,
L2 = 0.0,
nrounds = 1000,
eta = 0.2,
)

# @time m = fit_evotree(config; x_train, y_train, print_every_n=25);
# Fit with an evaluation set, tracking logloss; with `return_logger = true` the
# call is destructured into the model and a logger.
@time m, logger = EvoLinear.fit(
config;
x_train,
y_train,
x_eval,
y_eval,
early_stopping_rounds = 100,
print_every_n = 10,
metric = :logloss,
return_logger = true,
);
# The linear model is called on `x_eval` directly (no transpose).
p_linear = m(x_eval);
EvoLinear.Metrics.logloss(p_linear, y_eval)

# Spline model with logistic loss: 4 knots for each of feature ids 1 through 9.
config = EvoSplineRegressor(
T = Float32,
loss = :logistic,
nrounds = 600,
eta = 1e-3,
knots = Dict(1 => 4, 2 => 4, 3 => 4, 4 => 4, 5 => 4, 6 => 4, 7 => 4, 8 => 4, 9 => 4),
act = :elu,
batchsize = 4096,
device = :cpu,
)
# Same fitting call as for the linear model, with a shorter early-stopping window.
@time m, logger = EvoLinear.fit(
config;
x_train,
y_train,
x_eval,
y_eval,
early_stopping_rounds = 50,
print_every_n = 10,
metric = :logloss,
return_logger = true,
);
# @time m = EvoLinear.fit(config; x_train, y_train);
# Unlike the linear model, the spline model is called on the transposed matrix.
p_spline = m(x_eval')
# p_spline = m(x_eval' |> EvoLinear.Splines.gpu) |> EvoLinear.Splines.cpu
EvoLinear.Metrics.logloss(p_spline, y_eval)

# XGBoost tree-booster baseline; these parameters are splatted into the `xgboost`
# call below as keyword arguments.
params_xgb = Dict(
:objective => "reg:logistic",
:booster => "gbtree",
:eta => 0.05,
:max_depth => 4,
:lambda => 10.0,
:gamma => 0.0,
:subsample => 0.5,
:colsample_bytree => 0.8,
:tree_method => "hist",
:max_bin => 32,
:print_every_n => 5,
)

# NOTE(review): the first assignment is immediately overwritten, so the thread
# count is always 8 regardless of `Threads.nthreads()` — confirm which is intended.
nthread = Threads.nthreads()
nthread = 8

num_round = 250
metric_xgb = "logloss"

@info "xgboost train:"
# Training data and a watchlist holding the evaluation set under the name "eval".
dtrain = DMatrix(x_train, y_train)
watchlist = Dict("eval" => DMatrix(x_eval, y_eval))
@time m_xgb = xgboost(
dtrain;
watchlist,
num_round,
nthread = nthread,
verbosity = 0,
eval_metric = metric_xgb,
params_xgb...,
);
p_xgb_tree = XGBoost.predict(m_xgb, x_eval)

# XGBoost linear-booster (gblinear) baseline; these parameters are splatted into
# the `xgboost` call below as keyword arguments.
params_xgb = Dict(
    :booster => "gblinear",
    :updater => "shotgun", # shotgun / coord_descent
    :eta => 1.0,
    :lambda => 0.0,
    :objective => "reg:logistic",
    :print_every_n => 5,
)

# Fixed thread count; the original assigned `Threads.nthreads()` first and then
# immediately overwrote it with 8, so only the effective value is kept.
nthread = 8

num_round = 250
metric_xgb = "logloss"

@info "xgboost train:"
# Use the same keyword/DMatrix calling convention as the tree-booster run in this
# script. The previous positional form (`xgboost(x, nrounds; label, param, metrics,
# silent)`) belongs to a different XGBoost.jl API generation and cannot coexist
# with the tree-booster call.
dtrain = DMatrix(x_train, y_train)
watchlist = Dict("eval" => DMatrix(x_eval, y_eval))
@time m_xgb = xgboost(
    dtrain;
    watchlist,
    num_round,
    nthread = nthread,
    verbosity = 0,
    eval_metric = metric_xgb,
    params_xgb...,
);
p_xgb_linear = XGBoost.predict(m_xgb, x_eval)

# Side-by-side logloss on the evaluation set for the four models fitted above
# (values are only displayed when run interactively).
EvoLinear.Metrics.logloss(p_linear, y_eval)
EvoLinear.Metrics.logloss(p_spline, y_eval)
EvoLinear.Metrics.logloss(p_xgb_tree, y_eval)
EvoLinear.Metrics.logloss(p_xgb_linear, y_eval)
18 changes: 9 additions & 9 deletions experiments/random-gamma.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,24 @@ nobs = 1_000_000
nfeats = 100
T = Float32

x = randn(T, nobs, nfeats)
x_train = randn(T, nobs, nfeats)
coef = randn(T, nfeats) ./ 5
bias = 1

y = exp.(x * coef .+ bias .+ rand(T, nobs) * T(0.1))
maximum(y)
mean(y)
y_train = exp.(x_train * coef .+ bias .+ rand(T, nobs) * T(0.1))
maximum(y_train)
mean(y_train)

config = EvoLinearRegressor(nrounds=10, loss=:gamma, L1=0e-2, L2=0e-1)
@time m = EvoLinear.fit(config; x, y, metric=:gamma_deviance)
@time m = EvoLinear.fit(config; x_train, y_train, metric=:gamma_deviance)
sum(m.coef .== 0)

config = EvoLinearRegressor(nrounds=10, loss=:gamma, L1=1e-2, L2=1e-1)
@btime m = EvoLinear.fit(config; x, y, metric=:gamma_deviance);
@btime m = EvoLinear.fit(config; x_train, y_train, metric=:gamma_deviance);

p = EvoLinear.predict_proj(m, x)
@time EvoLinear.gamma(p, y)
@btime EvoLinear.gamma($p, $y);
p = EvoLinear.predict_proj(m, x_train)
@time EvoLinear.gamma_deviance(p, y_train)
@btime EvoLinear.gamma_deviance($p, $y_train);


using XGBoost
Expand Down
Loading

0 comments on commit db55427

Please sign in to comment.