Merge branch 'main' of https://github.com/jeremiedb/EvoLinear.jl

jeremiedb · Oct 5, 2023 · db55427 · db55427
2 parents 02a4679 + f8bdce8
commit db55427
Show file tree

Hide file tree

Showing 35 changed files with 1,699 additions and 534 deletions.
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
@@ -3,14 +3,42 @@ on:
   schedule:
     - cron: 0 0 * * *
   workflow_dispatch:
+permissions:
+  contents: write
+  pull-requests: write
 jobs:
   CompatHelper:
     runs-on: ubuntu-latest
     steps:
-      - name: Pkg.add("CompatHelper")
-        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
-      - name: CompatHelper.main()
+      - name: Check if Julia is already available in the PATH
+        id: julia_in_path
+        run: which julia
+        continue-on-error: true
+      - name: Install Julia, but only if it is not already available in the PATH
+        uses: julia-actions/setup-julia@v1
+        with:
+          version: '1'
+          arch: ${{ runner.arch }}
+        if: steps.julia_in_path.outcome != 'success'
+      - name: "Add the General registry via Git"
+        run: |
+          import Pkg
+          ENV["JULIA_PKG_SERVER"] = ""
+          Pkg.Registry.add("General")
+        shell: julia --color=yes {0}
+      - name: "Install CompatHelper"
+        run: |
+          import Pkg
+          name = "CompatHelper"
+          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
+          version = "3"
+          Pkg.add(; name, uuid, version)
+        shell: julia --color=yes {0}
+      - name: "Run CompatHelper"
+        run: |
+          import CompatHelper
+          CompatHelper.main()
+        shell: julia --color=yes {0}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
-        run: julia -e 'using CompatHelper; CompatHelper.main()'
diff --git a/Project.toml b/Project.toml
@@ -1,24 +1,32 @@
 name = "EvoLinear"
 uuid = "ab853011-1780-437f-b4b5-5de6f4777246"
 authors = ["jeremie <[email protected]> and contributors"]
-version = "0.3.0"
+version = "0.4.3"
 
 [deps]
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
+Distributions = "0.25"
+Flux = "0.13, 0.14"
 LoopVectorization = "0.12"
 MLJModelInterface = "1.0"
-StatsBase = "0.33"
+Optimisers = "0.2, 0.3"
+StatsBase = "0.33, 0.34"
 julia = "1.6"
 
 [extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
 test = ["Test", "MLJBase"]
diff --git a/README.md b/README.md
@@ -52,3 +52,12 @@ m = EvoLinear.fit(config; x, y, metric=:mse)
 p = EvoLinear.predict_proj(m, x)
 p = m(x)
 ```
+
+Splines - Experimental
+
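+The number of knots for each selected feature is specified through a `Dict` of the form `Dict(feat_id::Int => nknots::Int)`; for example, `Dict(1 => 4, 5 => 8)` requests 4 knots for feature 1 and 8 knots for feature 5.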
+```julia
+config = EvoSplineRegressor(loss=:mse, nrounds=10, knots = Dict(1 => 4, 5 => 8))
+# `fit` receives the training data through the `x_train` / `y_train` keywords
+m = EvoLinear.fit(config; x_train=x, y_train=y, metric=:mse)
+p = m(x')  # the spline model is called on the transposed feature matrix
+```
diff --git a/experiments/MLJ-spline.jl b/experiments/MLJ-spline.jl
@@ -0,0 +1,78 @@
+using Revise
+using EvoLinear
+using EvoLinear: logit, sigmoid
+using StatsBase: sample
+using MLJBase
+using Test
+
+##################################################
+### Regression - small data
+##################################################
+features = rand(10_000) .* 5 .- 2  # 10_000 uniform draws on [-2, 3)
+X = reshape(features, (size(features)[1], 1))  # single-column feature matrix
+Y = sin.(features) .* 0.5 .+ 0.5  # sine signal rescaled into [0, 1]
+Y = logit(Y) + randn(size(Y))  # perturb logit(Y) with standard-normal noise
+Y = sigmoid(Y)  # map the noisy values back through sigmoid
+y = Y
+X = MLJBase.table(X)
+
+# spline regression: 4 knots requested for feature 1
+model = EvoSplineRegressor(loss=:mse, nrounds=10, knots = Dict(1 => 4))
+mach = machine(model, X, y)
+train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
+fit!(mach, rows=train, verbosity=1)
+
+mach.model.nrounds += 2  # ask for 2 more rounds, then refit the same machine
+fit!(mach, rows=train, verbosity=1)
+mach.cache[:info][:nrounds]  # round count recorded in the machine cache
+
+# mean absolute error on the training rows
+pred_train = predict(mach, selectrows(X, train))
+mean(abs.(pred_train - selectrows(Y, train)))
+
+# mean absolute error on the held-out rows
+pred_test = predict(mach, selectrows(X, test))
+mean(abs.(pred_test - selectrows(Y, test)))
+
+@test MLJBase.iteration_parameter(EvoLinearRegressor) == :nrounds
+
+
+##################################################
+### Regression - matrix data
+##################################################
+X = MLJBase.matrix(X)  # turn the table from the previous section back into a matrix
+model = EvoLinearRegressor(loss=:logistic, nrounds=4)
+
+mach = machine(model, X, y)
+train, test = partition(eachindex(y), 0.7, shuffle=true); # 70:30 split
+fit!(mach, rows=train, verbosity=1)
+
+mach.model.nrounds += 2  # ask for 2 more rounds, then refit the same machine
+fit!(mach, rows=train, verbosity=1)
+
+pred_train = predict(mach, selectrows(X, train))
+mean(abs.(pred_train - selectrows(Y, train)))  # mean absolute error on the training rows
+
+
+####################################################################################
+# checks that `update` handles data correctly in the case of a cold restart:
+####################################################################################
+X = MLJBase.table(rand(5, 2))  # first pass: table input
+y = rand(5)
+model = EvoLinearRegressor(loss=:mse)
+data = MLJBase.reformat(model, X, y);
+f, c, r = MLJBase.fit(model, 2, data...);
+c[:info]  # inspect the info entry of the returned cache
+model.L2 = 0.1  # NOTE(review): presumably the L2 change is what triggers the cold restart - confirm
+model.nrounds += 2
+MLJBase.update(model, 2, f, c, data...)
+c[:info][:nrounds]
+
+X = rand(5, 2)  # second pass: same scenario with plain matrix input
+y = rand(5)
+model = EvoLinearRegressor(loss=:mse)
+data = MLJBase.reformat(model, X, y);
+f, c, r = MLJBase.fit(model, 2, data...);
+model.L2 = 0.1
+model.nrounds += 2
+MLJBase.update(model, 2, f, c, data...)
+MLJBase.update(model, 2, f, c, data...)  # called a second time with the same arguments
diff --git a/experiments/aicrowd-test.jl b/experiments/aicrowd-test.jl
@@ -0,0 +1,169 @@
+using Revise
+using CSV
+using DataFrames
+using EvoLinear
+using XGBoost
+using StatsBase: sample
+using Random: seed!
+
+using AWS: AWSCredentials, AWSConfig, @service
+@service S3
+aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"])
+aws_config = AWSConfig(; creds = aws_creds, region = "ca-central-1")
+
+path = "share/data/insurance-aicrowd.csv"
+raw = S3.get_object(
+    "jeremiedb",
+    path,
+    Dict("response-content-type" => "application/octet-stream");
+    aws_config,
+)
+df = DataFrame(CSV.File(raw))
+transform!(df, "claim_amount" => ByRow(x -> x > 0 ? 1.0f0 : 0.0f0) => "event")
+
+target = "event"
+feats = [
+    "vh_age",
+    "vh_value",
+    "vh_speed",
+    "vh_weight",
+    "drv_age1",
+    "pol_no_claims_discount",
+    "pol_coverage",
+    "pol_duration",
+    "pol_sit_duration",
+]
+
+# Ordinal codes for the `pol_coverage` levels, stored as Float64.
+pol_cov_dict = Dict{String,Float64}("Min" => 1.0, "Med1" => 2.0, "Med2" => 3.0, "Max" => 4.0)
+# Return the code for level `x`; any level not in `pol_cov_dict` falls back to 4.0.
+# The fallback is a Float64 literal so every path returns the same type as the
+# dictionary values (an `Int` fallback would make the mapped column mixed-type).
+pol_cov_map(x) = get(pol_cov_dict, x, 4.0)
+transform!(df, "pol_coverage" => ByRow(pol_cov_map) => "pol_coverage")
+
+setdiff(feats, names(df))
+
+# Reproducible 80/20 split: training row ids are drawn without replacement.
+seed!(123)
+nobs = nrow(df)
+id_train = sample(1:nobs, round(Int, 0.8 * nobs); replace = false)
+
+# Keep the feature and target columns only, then drop rows with missing values.
+df_train = dropmissing(df[id_train, vcat(feats, target)])
+df_eval = dropmissing(df[Not(id_train), vcat(feats, target)])
+
+# Float32 feature matrices and target vectors used by the fits below.
+x_train = Matrix{Float32}(df_train[:, feats])
+y_train = Vector{Float32}(df_train[:, target])
+x_eval = Matrix{Float32}(df_eval[:, feats])
+y_eval = Vector{Float32}(df_eval[:, target])
+
+config = EvoLinearRegressor(
+    T = Float32,
+    loss = :logistic,
+    L1 = 0.0,
+    L2 = 0.0,
+    nrounds = 1000,
+    eta = 0.2,
+)
+
+# @time m = fit_evotree(config; x_train, y_train, print_every_n=25);
+@time m, logger = EvoLinear.fit(
+    config;
+    x_train,
+    y_train,
+    x_eval,
+    y_eval,
+    early_stopping_rounds = 100,
+    print_every_n = 10,
+    metric = :logloss,
+    return_logger = true,
+);
+p_linear = m(x_eval);
+EvoLinear.Metrics.logloss(p_linear, y_eval)
+
+config = EvoSplineRegressor(
+    T = Float32,
+    loss = :logistic,
+    nrounds = 600,
+    eta = 1e-3,
+    knots = Dict(1 => 4, 2 => 4, 3 => 4, 4 => 4, 5 => 4, 6 => 4, 7 => 4, 8 => 4, 9 => 4),
+    act = :elu,
+    batchsize = 4096,
+    device = :cpu,
+)
+@time m, logger = EvoLinear.fit(
+    config;
+    x_train,
+    y_train,
+    x_eval,
+    y_eval,
+    early_stopping_rounds = 50,
+    print_every_n = 10,
+    metric = :logloss,
+    return_logger = true,
+);
+# @time m = EvoLinear.fit(config; x_train, y_train);
+p_spline = m(x_eval')
+# p_spline = m(x_eval' |> EvoLinear.Splines.gpu) |> EvoLinear.Splines.cpu
+EvoLinear.Metrics.logloss(p_spline, y_eval)
+
+params_xgb = Dict(
+    :objective => "reg:logistic",
+    :booster => "gbtree",
+    :eta => 0.05,
+    :max_depth => 4,
+    :lambda => 10.0,
+    :gamma => 0.0,
+    :subsample => 0.5,
+    :colsample_bytree => 0.8,
+    :tree_method => "hist",
+    :max_bin => 32,
+    :print_every_n => 5,
+)
+
+nthread = Threads.nthreads()
+nthread = 8
+
+num_round = 250
+metric_xgb = "logloss"
+
+@info "xgboost train:"
+dtrain = DMatrix(x_train, y_train)
+watchlist = Dict("eval" => DMatrix(x_eval, y_eval))
+@time m_xgb = xgboost(
+    dtrain;
+    watchlist,
+    num_round,
+    nthread = nthread,
+    verbosity = 0,
+    eval_metric = metric_xgb,
+    params_xgb...,
+);
+p_xgb_tree = XGBoost.predict(m_xgb, x_eval)
+
+params_xgb = Dict(
+    :booster => "gblinear",
+    :updater => "shotgun", # shotgun / coord_descent
+    :eta => 1.0,
+    :lambda => 0.0,
+    :objective => "reg:logistic",
+    :print_every_n => 5,
+)
+
+nthread = Threads.nthreads()
+nthread = 8
+
+nrounds = 250
+metrics = ["logloss"]
+
+# Train the gblinear booster with the same calling convention as the gbtree run
+# earlier in this script: a `DMatrix` carrying features and labels, the round
+# count via `num_round`, an evaluation `watchlist`, and the booster parameters
+# splatted as keywords.
+# NOTE(review): the previous positional form
+# `xgboost(x, nrounds, label=..., param=..., metrics=..., silent=...)` looks like
+# the pre-2.0 XGBoost.jl API - confirm against the pinned XGBoost.jl version.
+@info "xgboost train:"
+@time m_xgb = xgboost(
+    DMatrix(x_train, y_train);
+    watchlist = Dict("eval" => DMatrix(x_eval, y_eval)),
+    num_round = nrounds,
+    nthread = nthread,
+    verbosity = 0,
+    eval_metric = first(metrics),
+    params_xgb...,
+);
+p_xgb_linear = XGBoost.predict(m_xgb, x_eval)
+
+EvoLinear.Metrics.logloss(p_linear, y_eval)
+EvoLinear.Metrics.logloss(p_spline, y_eval)
+EvoLinear.Metrics.logloss(p_xgb_tree, y_eval)
+EvoLinear.Metrics.logloss(p_xgb_linear, y_eval)
diff --git a/experiments/random-gamma.jl b/experiments/random-gamma.jl
@@ -8,24 +8,24 @@ nobs = 1_000_000
 nfeats = 100
 T = Float32
 
-x = randn(T, nobs, nfeats)
+x_train = randn(T, nobs, nfeats)
 coef = randn(T, nfeats) ./ 5
 bias = 1
 
-y = exp.(x * coef .+ bias .+ rand(T, nobs) * T(0.1))
-maximum(y)
-mean(y)
+y_train = exp.(x_train * coef .+ bias .+ rand(T, nobs) * T(0.1))
+maximum(y_train)
+mean(y_train)
 
 config = EvoLinearRegressor(nrounds=10, loss=:gamma, L1=0e-2, L2=0e-1)
-@time m = EvoLinear.fit(config; x, y, metric=:gamma_deviance)
+@time m = EvoLinear.fit(config; x_train, y_train, metric=:gamma_deviance)
 sum(m.coef .== 0)
 
 config = EvoLinearRegressor(nrounds=10, loss=:gamma, L1=1e-2, L2=1e-1)
-@btime m = EvoLinear.fit(config; x, y, metric=:gamma_deviance);
+@btime m = EvoLinear.fit(config; x_train, y_train, metric=:gamma_deviance);
 
-p = EvoLinear.predict_proj(m, x)
-@time EvoLinear.gamma(p, y)
-@btime EvoLinear.gamma($p, $y);
+p = EvoLinear.predict_proj(m, x_train)
+@time EvoLinear.gamma_deviance(p, y_train)
+@btime EvoLinear.gamma_deviance($p, $y_train);
 
 
 using XGBoost