From b47c4d440665a0c711c14a792f92a27cbf95b2fb Mon Sep 17 00:00:00 2001 From: "jeremie.desgagne.bouchard" Date: Fri, 5 Apr 2024 00:46:58 -0400 Subject: [PATCH 1/6] test --- src/loss.jl | 44 ++++++++++++++++++++++++++++++++++++++++++++ src/metrics.jl | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/src/loss.jl b/src/loss.jl index 30a716a..67e5e2c 100644 --- a/src/loss.jl +++ b/src/loss.jl @@ -31,6 +31,49 @@ function logloss(m, x, y, w, offset) sum(w .* ((1 .- y) .* p .- logσ.(p))) / sum(w) end +function tweedie(m, x, y) + rho = eltype(x)(1.5) + p = m(x) + mean(2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + + p .^ (2 - rho) / (2 - rho)) + ) +end +function tweedie(m, x, y, w) + rho = eltype(x)(1.5) + p = m(x) + sum(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + + p .^ (2 - rho) / (2 - rho)) + ) / sum(w) +end +function tweedie(m, x, y, w, offset) + rho = eltype(x)(1.5) + p = m(x) .+ offset + sum(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + + p .^ (2 - rho) / (2 - rho)) + ) / sum(w) +end + + +function tweedie( + p::AbstractMatrix{T}, + y::AbstractVector, + w::AbstractVector, + eval::AbstractVector; + kwargs... +) where {T} + @threads for i in eachindex(y) + pred = exp(p[1, i]) + eval[i] = + w[i] * + 2 * + ( + y[i]^(2 - rho) / (1 - rho) / (2 - rho) - y[i] * pred^(1 - rho) / (1 - rho) + + pred^(2 - rho) / (2 - rho) + ) + end + return sum(eval) / sum(w) +end + function mlogloss(m, x, y) p = logsoftmax(m(x); dims=1) k = size(p, 1) @@ -69,6 +112,7 @@ const _loss_fn_dict = Dict( :mse => mse, :mae => mae, :logloss => logloss, + :tweedie => tweedie, :mlogloss => mlogloss, :gaussian_mle => gaussian_mle, ) diff --git a/src/metrics.jl b/src/metrics.jl index 4fecba1..6eb5150 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -64,6 +64,50 @@ function logloss(m, x, y, w, offset; agg=mean) end +""" + logloss(x, y; agg=mean) + logloss(x, y, w; agg=mean) + logloss(x, y, w, offset; agg=mean) +""" +function logloss(m, x, y; agg=mean) + p = m(x) + metric = agg((1 .- y) .* p .- logσ.(p)) + return metric +end +function logloss(m, x, y, w; agg=mean) + p = m(x) + metric = agg(((1 .- y) .* p .- logσ.(p)) .* w) + return metric +end +function logloss(m, x, y, w, offset; agg=mean) + p = m(x) .+ offset + metric = agg(((1 .- y) .* p .- logσ.(p)) .* w) + return metric +end + +function tweedie(m, x, y; agg=mean) + rho = eltype(x)(1.5) + p = m(x) + agg(2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + + p .^ (2 - rho) / (2 - rho)) + ) +end +function tweedie(m, x, y, w) + agg = mean + rho = eltype(x)(1.5) + p = m(x) + agg(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + + p .^ (2 - rho) / (2 - rho)) + ) +end +function tweedie(m, x, y, w, offset; agg=mean) + rho = eltype(x)(1.5) + p = m(x) .+ offset + agg(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + + p .^ (2 - rho) / (2 - rho)) + ) +end + """ mlogloss(x, y; agg=mean) mlogloss(x, y, w; agg=mean) @@ -141,6 +185,7 @@ const metric_dict = Dict( :mse => mse, :mae => mae, :logloss => logloss, + :tweedie => tweedie, :mlogloss => mlogloss, :gaussian_mle => gaussian_mle, ) @@ -148,6 +193,7 @@ const metric_dict = Dict( is_maximise(::typeof(mse)) = false is_maximise(::typeof(mae)) = false is_maximise(::typeof(logloss)) = false +is_maximise(::typeof(tweedie)) = false is_maximise(::typeof(mlogloss)) = false 
is_maximise(::typeof(gaussian_mle)) = true From ce584b33c9c699578bb709e37c3191b456c571cc Mon Sep 17 00:00:00 2001 From: "jeremie.desgagne.bouchard" Date: Fri, 19 Apr 2024 15:18:02 -0400 Subject: [PATCH 2/6] tweedie --- benchmarks/MSRank-tweedie.jl | 89 ++++++++++++++++++++++++++++++++++++ src/loss.jl | 35 +++----------- src/metrics.jl | 38 +++++---------- src/model.jl | 2 + 4 files changed, 109 insertions(+), 55 deletions(-) create mode 100644 benchmarks/MSRank-tweedie.jl diff --git a/benchmarks/MSRank-tweedie.jl b/benchmarks/MSRank-tweedie.jl new file mode 100644 index 0000000..18dd3fd --- /dev/null +++ b/benchmarks/MSRank-tweedie.jl @@ -0,0 +1,89 @@ +using Revise +using Random +using CSV +using DataFrames +using StatsBase +using Statistics: mean, std +using NeuroTreeModels +using Solage: Connectors +using ReadLIBSVM +using AWS: AWSCredentials, AWSConfig, @service + +# https://www.microsoft.com/en-us/research/project/mslr/ + +@service S3 +aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"]) +aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1") +bucket = "jeremiedb" + +# initial prep +function read_libsvm_aws(file::String; has_query=false, aws_config=AWSConfig()) + raw = S3.get_object("jeremiedb", file, Dict("response-content-type" => "application/octet-stream"); aws_config) + return read_libsvm(raw; has_query) +end + +@time train_raw = read_libsvm_aws("share/data/msrank/train.txt"; has_query=true, aws_config); +@time eval_raw = read_libsvm_aws("share/data/msrank/vali.txt"; has_query=true, aws_config); +@time test_raw = read_libsvm_aws("share/data/msrank/test.txt"; has_query=true, aws_config); + +dtrain = DataFrame(train_raw[:x], :auto) +dtrain.y_raw = train_raw[:y] +dtrain.y = dtrain.y_raw ./ 4 +dtrain.q = train_raw[:q] + +deval = DataFrame(eval_raw[:x], :auto) +deval.y_raw = eval_raw[:y] +deval.y = deval.y_raw ./ 4 +deval.q = eval_raw[:q] + +dtest = DataFrame(test_raw[:x], :auto) +dtest.y_raw = test_raw[:y] +dtest.y = dtest.y_raw ./ 4 +dtest.q = test_raw[:q] + +feature_names = setdiff(names(dtrain), ["y", "y_raw", "q"]) +target_name = "y_raw" + +function percent_rank(x::AbstractVector{T}) where {T} + return tiedrank(x) / (length(x) + 1) +end + +transform!(dtrain, feature_names .=> percent_rank .=> feature_names) +transform!(deval, feature_names .=> percent_rank .=> feature_names) +transform!(dtest, feature_names .=> percent_rank .=> feature_names) + +config = NeuroTreeRegressor( + device=:gpu, + loss=:tweedie_deviance, + nrounds=2, + actA=:tanh, + outsize=1, + depth=4, + ntrees=64, + stack_size=2, + hidden_size=16, + batchsize=4096, + lr=3e-4, +) + +@time m, logger = NeuroTreeModels.fit( + config, + dtrain; + deval, + target_name, + feature_names, + print_every_n=1, + early_stopping_rounds=3, + metric=:tweedie_deviance, + return_logger=true +); + +dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); +p_eval = m(dinfer_eval); +mse_eval = mean((p_eval .- deval.y_raw) .^ 2) +@info "MSE - deval" mse_eval + +dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); +p_test = m(dinfer_test); +mse_test = mean((p_test .- dtest.y_raw) .^ 2) +@info "MSE - dtest" mse_test diff --git a/src/loss.jl b/src/loss.jl index 67e5e2c..f87c1e6 100644 --- a/src/loss.jl +++ b/src/loss.jl @@ -31,49 +31,28 @@ function logloss(m, x, y, w, offset) sum(w .* ((1 .- y) .* p .- logσ.(p))) / sum(w) end -function tweedie(m, x, y) +function 
tweedie_deviance(m, x, y) rho = eltype(x)(1.5) - p = m(x) + p = exp.(m(x)) mean(2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + p .^ (2 - rho) / (2 - rho)) ) end -function tweedie(m, x, y, w) +function tweedie_deviance(m, x, y, w) rho = eltype(x)(1.5) - p = m(x) + p = exp.(m(x)) sum(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + p .^ (2 - rho) / (2 - rho)) ) / sum(w) end -function tweedie(m, x, y, w, offset) +function tweedie_deviance(m, x, y, w, offset) rho = eltype(x)(1.5) - p = m(x) .+ offset + p = exp.(m(x) .+ offset) sum(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + p .^ (2 - rho) / (2 - rho)) ) / sum(w) end - -function tweedie( - p::AbstractMatrix{T}, - y::AbstractVector, - w::AbstractVector, - eval::AbstractVector; - kwargs... -) where {T} - @threads for i in eachindex(y) - pred = exp(p[1, i]) - eval[i] = - w[i] * - 2 * - ( - y[i]^(2 - rho) / (1 - rho) / (2 - rho) - y[i] * pred^(1 - rho) / (1 - rho) + - pred^(2 - rho) / (2 - rho) - ) - end - return sum(eval) / sum(w) -end - function mlogloss(m, x, y) p = logsoftmax(m(x); dims=1) k = size(p, 1) @@ -112,9 +91,9 @@ const _loss_fn_dict = Dict( :mse => mse, :mae => mae, :logloss => logloss, - :tweedie => tweedie, :mlogloss => mlogloss, :gaussian_mle => gaussian_mle, + :tweedie_deviance => tweedie_deviance, ) get_loss_fn(config::NeuroTreeRegressor) = _loss_fn_dict[config.loss] diff --git a/src/metrics.jl b/src/metrics.jl index 6eb5150..60c59d8 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -65,44 +65,28 @@ end """ - logloss(x, y; agg=mean) - logloss(x, y, w; agg=mean) - logloss(x, y, w, offset; agg=mean) + tweedie_deviance(x, y; agg=mean) + tweedie_deviance(x, y, w; agg=mean) + tweedie_deviance(x, y, w, offset; agg=mean) """ -function logloss(m, x, y; agg=mean) - p = m(x) - metric = agg((1 .- y) .* p .- logσ.(p)) - return metric -end -function logloss(m, x, y, w; agg=mean) - p = m(x) - metric = agg(((1 .- y) .* p .- logσ.(p)) .* w) - return metric -end -function logloss(m, x, y, w, offset; agg=mean) - p = m(x) .+ offset - metric = agg(((1 .- y) .* p .- logσ.(p)) .* w) - return metric -end - -function tweedie(m, x, y; agg=mean) +function tweedie_deviance(m, x, y; agg=mean) rho = eltype(x)(1.5) - p = m(x) + p = exp.(m(x)) agg(2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + p .^ (2 - rho) / (2 - rho)) ) end -function tweedie(m, x, y, w) +function tweedie_deviance(m, x, y, w) agg = mean rho = eltype(x)(1.5) - p = m(x) + p = exp.(m(x)) agg(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + p .^ (2 - rho) / (2 - rho)) ) end -function tweedie(m, x, y, w, offset; agg=mean) +function tweedie_deviance(m, x, y, w, offset; agg=mean) rho = eltype(x)(1.5) - p = m(x) .+ offset + p = exp.(m(x) .+ offset) agg(w .* 2 .* (y .^ (2 - rho) / (1 - rho) / (2 - rho) - y .* p .^ (1 - rho) / (1 - rho) + p .^ (2 - rho) / (2 - rho)) ) @@ -185,16 +169,16 @@ const metric_dict = Dict( :mse => mse, :mae => mae, :logloss => logloss, - :tweedie => tweedie, :mlogloss => mlogloss, :gaussian_mle => gaussian_mle, + :tweedie_deviance => tweedie_deviance, ) is_maximise(::typeof(mse)) = false is_maximise(::typeof(mae)) = false is_maximise(::typeof(logloss)) = false -is_maximise(::typeof(tweedie)) = false is_maximise(::typeof(mlogloss)) = false is_maximise(::typeof(gaussian_mle)) = true +is_maximise(::typeof(tweedie_deviance)) = false end \ No newline at end of file diff --git a/src/model.jl 
b/src/model.jl index 27e0948..d225922 100644 --- a/src/model.jl +++ b/src/model.jl @@ -4,6 +4,7 @@ abstract type MAE <: LossType end abstract type LogLoss <: LossType end abstract type MLogLoss <: LossType end abstract type GaussianMLE <: LossType end +abstract type TweedieDeviance <: LossType end const _loss_type_dict = Dict( :mse => MSE, @@ -11,6 +12,7 @@ const _loss_type_dict = Dict( :logloss => LogLoss, :mlogloss => MLogLoss, :gaussian_mle => GaussianMLE, + :tweedie_deviance => TweedieDeviance ) mutable struct NeuroTreeRegressor <: MMI.Deterministic From 73471da49e36f2526394a03dc80cf9a625b24d35 Mon Sep 17 00:00:00 2001 From: jeremie Date: Sun, 21 Apr 2024 12:45:40 -0400 Subject: [PATCH 3/6] classif fixes api cleanup --- Project.toml | 5 +- benchmarks/titanic-logloss.jl | 58 +++++ benchmarks/titanic-mlogloss.jl | 64 ++++++ experiments/dataloader.jl | 52 ++++- src/MLJ.jl | 31 ++- src/NeuroTreeModels.jl | 5 +- src/callback.jl | 42 +--- src/data.jl | 143 ++++++------ src/fit.jl | 88 ++++---- src/learners.jl | 384 +++++++++++++++++++++++++++++++++ src/loss.jl | 3 +- src/metrics.jl | 2 +- src/model.jl | 225 +------------------ test/MLJ.jl | 58 ++++- test/core.jl | 72 +++++-- 15 files changed, 835 insertions(+), 397 deletions(-) create mode 100644 benchmarks/titanic-logloss.jl create mode 100644 benchmarks/titanic-mlogloss.jl create mode 100644 src/learners.jl diff --git a/Project.toml b/Project.toml index 9f24418..f404ca7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ -authors = ["jeremiedb "] name = "NeuroTreeModels" uuid = "1db4e0a5-a364-4b0c-897c-2bd5a4a3a1f2" -version = "1.2.0" +authors = ["jeremiedb "] +version = "1.3.0" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" diff --git a/benchmarks/titanic-logloss.jl b/benchmarks/titanic-logloss.jl new file mode 100644 index 0000000..ded0388 --- /dev/null +++ b/benchmarks/titanic-logloss.jl @@ -0,0 +1,58 @@ +using NeuroTreeModels +using MLDatasets +using DataFrames +using Statistics: mean +using StatsBase: median +using CategoricalArrays +using Random +using CUDA +using CategoricalArrays + +Random.seed!(123) + +df = MLDatasets.Titanic().dataframe + +# convert string feature to Categorical +transform!(df, :Sex => categorical => :Sex) +transform!(df, :Sex => ByRow(levelcode) => :Sex) + +# treat string feature and missing values +transform!(df, :Age => ByRow(ismissing) => :Age_ismissing) +transform!(df, :Age => (x -> coalesce.(x, median(skipmissing(x)))) => :Age); + +# remove unneeded variables +df = df[:, Not([:PassengerId, :Name, :Embarked, :Cabin, :Ticket])] + +train_ratio = 0.8 +train_indices = randperm(nrow(df))[1:Int(round(train_ratio * nrow(df)))] + +dtrain = df[train_indices, :] +deval = df[setdiff(1:nrow(df), train_indices), :] + +target_name = "Survived" +feature_names = setdiff(names(df), ["Survived"]) + +config = NeuroTreeRegressor( + loss=:logloss, + nrounds=400, + depth=4, + lr=3e-2, +) + +m = NeuroTreeModels.fit( + config, + dtrain; + deval, + target_name, + feature_names, + metric=:logloss, + print_every_n=10, + early_stopping_rounds=3, + device=:cpu +) + +p_train = m(dtrain) +p_eval = m(deval) + +@info mean((p_train .> 0.5) .== (dtrain[!, target_name] .> 0.5)) +@info mean((p_eval .> 0.5) .== (deval[!, target_name] .> 0.5)) diff --git a/benchmarks/titanic-mlogloss.jl b/benchmarks/titanic-mlogloss.jl 
new file mode 100644 index 0000000..ba78b48 --- /dev/null +++ b/benchmarks/titanic-mlogloss.jl @@ -0,0 +1,64 @@ +using NeuroTreeModels +using MLDatasets +using DataFrames +using Statistics: mean +using StatsBase: median +using CategoricalArrays +using Random +using CUDA +using CategoricalArrays + +Random.seed!(123) + +df = MLDatasets.Titanic().dataframe + +# convert target variable to a categorical +transform!(df, :Survived => categorical => :y_cat) + +# convert string feature to Categorical +transform!(df, :Sex => categorical => :Sex) +transform!(df, :Sex => ByRow(levelcode) => :Sex) + +# treat string feature and missing values +transform!(df, :Age => ByRow(ismissing) => :Age_ismissing) +transform!(df, :Age => (x -> coalesce.(x, median(skipmissing(x)))) => :Age); + +# remove unneeded variables +df = df[:, Not([:PassengerId, :Name, :Embarked, :Cabin, :Ticket])] + +train_ratio = 0.8 +train_indices = randperm(nrow(df))[1:Int(round(train_ratio * nrow(df)))] + +dtrain = df[train_indices, :] +deval = df[setdiff(1:nrow(df), train_indices), :] + +target_name = "y_cat" +feature_names = setdiff(names(df), ["y_cat", "Survived"]) + +eltype(dtrain[:, "y_cat"]) +config = NeuroTreeClassifier( + nrounds=400, + depth=4, + lr=3e-2, +) + +m = NeuroTreeModels.fit( + config, + dtrain; + deval, + target_name, + feature_names, + metric=:mlogloss, + print_every_n=10, + early_stopping_rounds=3, + device=:cpu +) + +p_train = m(dtrain) +p_train_idx = [argmax(p) for p in eachrow(p_train)] + +p_eval = m(deval) +p_eval_idx = [argmax(p) for p in eachrow(p_eval)] + +@info mean(p_train_idx .== levelcode.(dtrain[!, target_name])) +@info mean(p_eval_idx .== levelcode.(deval[!, target_name])) diff --git a/experiments/dataloader.jl b/experiments/dataloader.jl index 7370bf5..feedeb8 100644 --- a/experiments/dataloader.jl +++ b/experiments/dataloader.jl @@ -1,25 +1,67 @@ using NeuroTreeModels using DataFrames +using CategoricalArrays ################################# # vanilla DataFrame ################################# -nobs=100 -nfeats=10 +nobs = 100 +nfeats = 10 x = rand(nobs, nfeats); df = DataFrame(x, :auto); df.y = rand(nobs); target_name = "y" -feature_names = setdiff(names(df), [target_name]) +feature_names = Symbol.(setdiff(names(df), [target_name])) +batchsize = 32 + +################################### +# CPU +################################### +device = :cpu +dtrain = NeuroTreeModels.get_df_loader_train(df; feature_names, target_name, batchsize, device) + +for d in dtrain + @info length(d) + @info size(d[1]) +end + +deval = NeuroTreeModels.get_df_loader_infer(df; feature_names, batchsize=32) +for d in deval + @info size(d) +end + +################################### +# GPU +################################### +device = :gpu +dtrain = NeuroTreeModels.get_df_loader_train(df; feature_names, target_name, batchsize, device) -dtrain = NeuroTrees.get_df_loader_train(df; feature_names, target_name, batchsize=32) for d in dtrain @info length(d) @info size(d[1]) end -deval = NeuroTrees.get_df_loader_infer(df; feature_names, batchsize=32) +deval = NeuroTreeModels.get_df_loader_infer(df; feature_names, batchsize=32) for d in deval @info size(d) end + +################################### +# Categorical +################################### +target_name = "y" +feature_names = Symbol.(setdiff(names(df), [target_name])) +batchsize = 32 +device = :gpu + +x = rand(nobs, nfeats); +df = DataFrame(x, :auto); +df.y = categorical(rand(1:2, nobs)); + +dtrain = NeuroTreeModels.get_df_loader_train(df; feature_names, target_name, batchsize, 
device) +for d in dtrain + @info length(d) + @info size(d[1]) + @info typeof(d[2]) +end diff --git a/src/MLJ.jl b/src/MLJ.jl index 78da31a..9f34806 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -1,5 +1,5 @@ function MMI.fit( - model::NeuroTreeRegressor, + model::NeuroTypes, verbosity::Int, A, y, @@ -8,7 +8,6 @@ function MMI.fit( Tables.istable(A) ? dtrain = DataFrame(A) : error("`A` must be a Table") nobs = Tables.DataAPI.nrow(dtrain) feature_names = string.(collect(Tables.schema(dtrain).names)) - @info feature_names @assert "_target" ∉ feature_names dtrain._target = y target_name = "_target" @@ -22,7 +21,7 @@ function MMI.fit( end offset_name = nothing - fitresult, cache = init(model, dtrain; feature_names, target_name, weight_name, offset_name) + fitresult, cache = init(model, dtrain; feature_names, target_name, weight_name, offset_name, device=:cpu) while fitresult.info[:nrounds] < model.nrounds fit_iter!(fitresult, cache) @@ -37,10 +36,10 @@ function okay_to_continue(model, fitresult, cache) end # For EarlyStopping.jl support -MMI.iteration_parameter(::Type{<:NeuroTreeRegressor}) = :nrounds +MMI.iteration_parameter(::Type{<:NeuroTypes}) = :nrounds function MMI.update( - model::NeuroTreeRegressor, + model::NeuroTypes, verbosity::Integer, fitresult, cache, @@ -68,9 +67,17 @@ function MMI.predict(::NeuroTreeRegressor, fitresult, A) return pred end +function predict(::NeuroTreeClassifier, fitresult, A) + df = DataFrame(A) + Tables.istable(A) ? df = DataFrame(A) : error("`A` must be a Table") + dinfer = get_df_loader_infer(df; feature_names=fitresult.info[:feature_names], batchsize=2048, device=:cpu) + pred = infer(fitresult, dinfer) + return MMI.UnivariateFinite(fitresult.info[:target_levels], pred, pool=missing, ordered=fitresult.info[:target_isordered]) +end + # Metadata MMI.metadata_pkg.( - (NeuroTreeRegressor), + (NeuroTreeRegressor, NeuroTreeClassifier), name="NeuroTreeModels", uuid="1db4e0a5-a364-4b0c-897c-2bd5a4a3a1f2", url="https://github.com/Evovest/NeuroTreeModels.jl", @@ -81,10 +88,16 @@ MMI.metadata_pkg.( MMI.metadata_model( NeuroTreeRegressor, - input_scitype=Union{ - MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), - }, + input_scitype=MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), target_scitype=AbstractVector{<:MMI.Continuous}, weights=true, path="NeuroTreeModels.NeuroTreeRegressor", ) + +MMI.metadata_model( + NeuroTreeClassifier, + input_scitype=MMI.Table(MMI.Continuous, MMI.Count, MMI.OrderedFactor), + target_scitype=AbstractVector{<:MMI.Finite}, + weights=true, + path="NeuroTreeModels.NeuroTreeClassifier", +) diff --git a/src/NeuroTreeModels.jl b/src/NeuroTreeModels.jl index 2be8638..3e4a998 100644 --- a/src/NeuroTreeModels.jl +++ b/src/NeuroTreeModels.jl @@ -3,6 +3,7 @@ module NeuroTreeModels using Base.Threads: @threads, nthreads import Tables using DataFrames +using CategoricalArrays using Statistics: mean, std using Random @@ -19,11 +20,13 @@ using ChainRulesCore import ChainRulesCore: rrule import MLJModelInterface as MMI +import MLJModelInterface: fit, update, predict, schema -export NeuroTreeRegressor, NeuroTreeModel, NeuroTree +export NeuroTreeRegressor, NeuroTreeClassifier, NeuroTreeModel, NeuroTree include("data.jl") include("utils.jl") +include("learners.jl") include("model.jl") include("loss.jl") include("metrics.jl") diff --git a/src/callback.jl b/src/callback.jl index 4d70acf..ff940ba 100644 --- a/src/callback.jl +++ b/src/callback.jl @@ -4,7 +4,7 @@ using DataFrames using Statistics: mean, median using Flux: cpu, gpu using CUDA: CuIterator 
-using ..NeuroTreeModels: NeuroTreeRegressor +using ..NeuroTreeModels: NeuroTypes using ..NeuroTreeModels: get_df_loader_train using ..NeuroTreeModels.Metrics @@ -21,43 +21,19 @@ function (cb::CallBack)(logger, iter, m) return nothing end -# function CallBack(config::NeuroTreeRegressor; metric, x_eval, y_eval, w_eval=nothing, offset_eval=nothing) -# feval = metric_dict[metric] - -# y_eval = ndims(y_eval) == 1 ? y_eval : y_eval' -# w_eval = isnothing(w_eval) ? ones(Float32, size(y_eval)[end]) : Vector{Float32}(w_eval) -# offset_eval = isnothing(offset_eval) ? zeros(Float32, size(y_eval)[end]) : Vector{Float32}(offset_eval) - -# deval = DataLoader( -# ( -# x=Matrix{Float32}(x_eval'), -# y=Float32.(y_eval), -# w=Float32.(w_eval), -# offset = Float32.(offset_eval), -# ), -# batchsize=config.batchsize, -# partial=true, -# shuffle=false, -# parallel=true, -# buffer=false, -# ) -# (config.device == :gpu) && (deval = CuIterator(deval)) -# return CallBack(feval, deval) -# end - function CallBack( - config::NeuroTreeRegressor, + config::NeuroTypes, deval::AbstractDataFrame; metric, feature_names, target_name, weight_name=nothing, - offset_name=nothing) + offset_name=nothing, + device=:cpu) batchsize = config.batchsize feval = metric_dict[metric] - deval = get_df_loader_train(deval; feature_names, target_name, weight_name, offset_name, batchsize) - (config.device == :gpu) && (deval = CuIterator(deval)) + deval = get_df_loader_train(deval; feature_names, target_name, weight_name, offset_name, batchsize, device) return CallBack(feval, deval) end @@ -96,11 +72,11 @@ end function agg_logger(logger_raw::Vector{Dict}) _l1 = first(logger_raw) - best_iters = [d[:best_iter] for d in logger_raw] - best_iter = ceil(Int, median(best_iters)) + best_iters = [d[:best_iter] for d in logger_raw] + best_iter = ceil(Int, median(best_iters)) - best_metrics = [d[:best_metric] for d in logger_raw] - best_metric = last(best_metrics) + best_metrics = [d[:best_metric] for d in logger_raw] + best_metric = last(best_metrics) metrics = (layer=Int[], iter=Int[], metric=Float64[]) for i in eachindex(logger_raw) diff --git a/src/data.jl b/src/data.jl index 04c6d50..247554d 100644 --- a/src/data.jl +++ b/src/data.jl @@ -4,56 +4,45 @@ import Base: length, getindex ContainerTrain """ -struct ContainerTrain{D<:AbstractDataFrame} - df::D - feature_names::Vector{Symbol} - target_name::String - weight_name::Union{Symbol,Nothing} - offset_name::Union{Symbol,Vector{Symbol},Nothing} +struct ContainerTrain{A<:AbstractMatrix,B<:AbstractVector,C,D} + x::A + y::B + w::C + offset::D end -function ContainerTrain( - df; - feature_names::Vector{Symbol}, - target_name, - weight_name=nothing, - offset_name=nothing) +length(data::ContainerTrain) = size(data.x, 2) - container = ContainerTrain( - df, - feature_names, - target_name, - weight_name, - offset_name) - - return container +function getindex(data::ContainerTrain{A,B,C,D}, idx::AbstractVector) where {A,B,C<:Nothing,D<:Nothing} + x = data.x[:, idx] + y = data.y[idx] + return (x, y) end - -length(data::ContainerTrain{<:AbstractDataFrame}) = nrow(data.df) - -function getindex(data::ContainerTrain{<:AbstractDataFrame}, idx::AbstractVector) - df = view(data.df, idx, :) - x = Matrix{Float32}(Matrix{Float32}(select(df, data.feature_names))') - y = Float32.(df[!, data.target_name]) - if isnothing(data.weight_name) && isnothing(data.offset_name) - return (x, y) - elseif isnothing(data.offset_name) - w = Float32.(df[!, data.weight_name]) - return (x, y, w) - elseif isnothing(data.weight_name) - w = 
ones(Float32, length(y)) - isa(data.offset_name, String) ? offset = Float32.(df[!, data.offset_name]) : offset = Matrix{Float32}(Matrix{Float32}(df[!, data.offset_name])') - return (x, y, w, offset) - else - w = Float32.(df[!, data.weight_name]) - isa(data.offset_name, String) ? offset = Float32.(df[!, data.offset_name]) : offset = Matrix{Float32}(Matrix{Float32}(df[!, data.offset_name])') - return (x, y, w, offset) - end +function getindex(data::ContainerTrain{A,B,C,D}, idx::AbstractVector) where {A,B,C<:AbstractVector,D<:Nothing} + x = data.x[:, idx] + y = data.y[idx] + w = data.w[idx] + return (x, y, w) +end +function getindex(data::ContainerTrain{A,B,C,D}, idx::AbstractVector) where {A,B,C<:AbstractVector,D<:AbstractVector} + x = data.x[:, idx] + y = data.y[idx] + w = data.w[idx] + offset = data.offset[idx] + return (x, y, w, offset) +end +function getindex(data::ContainerTrain{A,B,C,D}, idx::AbstractVector) where {A,B,C<:AbstractVector,D<:AbstractMatrix} + x = data.x[:, idx] + y = data.y[idx] + w = data.w[idx] + offset = data.offset[:, idx] + return (x, y, w, offset) end + function get_df_loader_train( df::AbstractDataFrame; - feature_names::Vector{Symbol}, + feature_names, target_name, weight_name=nothing, offset_name=nothing, @@ -61,10 +50,27 @@ function get_df_loader_train( shuffle=true, device=:cpu) - container = ContainerTrain(df; feature_names, target_name, weight_name, offset_name) + feature_names = Symbol.(feature_names) + x = Matrix{Float32}(Matrix{Float32}(select(df, feature_names))') + + if eltype(df[!, target_name]) <: CategoricalValue + y = UInt32.(CategoricalArrays.levelcode.(df[!, target_name])) + else + y = Float32.(df[!, target_name]) + end + + w = isnothing(weight_name) ? nothing : Float32.(df[!, weight_name]) + + offset = if isnothing(offset_name) + nothing + else + isa(offset_name, String) ? 
Float32.(df[!, offset_name]) : offset = Matrix{Float32}(Matrix{Float32}(df[!, data.offset_name])') + end + + container = ContainerTrain(x, y, w, offset) batchsize = min(batchsize, length(container)) dtrain = DataLoader(container; shuffle, batchsize, partial=true, parallel=false) - if Symbol(device) == :gpu + if device == :gpu return CuIterator(dtrain) else return dtrain @@ -76,32 +82,27 @@ end ContainerInfer """ -struct ContainerInfer{D<:AbstractDataFrame} - df::D - feature_names::Vector{Symbol} - offset_name::Union{Symbol,Nothing} -end - -function ContainerInfer( - df; - feature_names::Vector{Symbol}, - offset_name=nothing) - - container = ContainerInfer( - df, - feature_names, - offset_name) - - return container +struct ContainerInfer{A<:AbstractMatrix,D} + x::A + offset::D end -length(data::ContainerInfer{<:AbstractDataFrame}) = nrow(data.df) +length(data::ContainerInfer) = size(data.x, 2) -function getindex(data::ContainerInfer{<:AbstractDataFrame}, idx::AbstractVector) - df = view(data.df, idx, :) - x = Matrix{Float32}(Matrix{Float32}(select(df, data.feature_names))') +function getindex(data::ContainerInfer{A,D}, idx::AbstractVector) where {A,D<:Nothing} + x = data.x[:, idx] return x end +function getindex(data::ContainerTrain{A,D}, idx::AbstractVector) where {A,D<:AbstractVector} + x = data.x[:, idx] + offset = data.offset[idx] + return (x, offset) +end +function getindex(data::ContainerTrain{A,D}, idx::AbstractVector) where {A,D<:AbstractMatrix} + x = data.x[:, idx] + offset = data.offset[:, idx] + return (x, offset) +end function get_df_loader_infer( df::AbstractDataFrame; @@ -111,10 +112,18 @@ function get_df_loader_infer( device=:cpu) feature_names = Symbol.(feature_names) - container = ContainerInfer(df; feature_names, offset_name) + x = Matrix{Float32}(Matrix{Float32}(select(df, feature_names))') + + offset = if isnothing(offset_name) + nothing + else + isa(offset_name, String) ? 
Float32.(df[!, offset_name]) : offset = Matrix{Float32}(Matrix{Float32}(df[!, data.offset_name])') + end + + container = ContainerInfer(x, offset) batchsize = min(batchsize, length(container)) dinfer = DataLoader(container; shuffle=false, batchsize, partial=true, parallel=false) - if Symbol(device) == :gpu + if device == :gpu return CuIterator(dinfer) else return dinfer diff --git a/src/fit.jl b/src/fit.jl index 9f5d5b3..aa67770 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -1,48 +1,52 @@ function init( - config::NeuroTreeRegressor, + config::NeuroTypes, df::AbstractDataFrame; feature_names, target_name, weight_name=nothing, - offset_name=nothing) + offset_name=nothing, + device=:cpu, +) batchsize = config.batchsize - feature_names = Symbol.(feature_names) - if config.device == :gpu - device = Flux.gpu - CUDA.device!(config.gpuID) - else - device = Flux.cpu - end - - dtrain = NeuroTreeModels.get_df_loader_train(df; feature_names, target_name, weight_name, offset_name, batchsize) - (config.device == :gpu) && (dtrain = CuIterator(dtrain)) - nfeats = length(feature_names) loss = get_loss_fn(config) L = get_loss_type(config) - chain = get_model_chain(L; config, nfeats) + + target_levels = nothing + target_isordered = false + outsize = 1 + if L <: MLogLoss + eltype(df[!, target_name]) <: CategoricalValue || error("Target variable `$target_name` must have its elements `<: CategoricalValue`") + target_levels = CategoricalArrays.levels(df[!, target_name]) + target_isordered = isordered(df[!, target_name]) + outsize = length(target_levels) + end + dtrain = NeuroTreeModels.get_df_loader_train(df; feature_names, target_name, weight_name, offset_name, batchsize, device) + + chain = get_model_chain(L; config, nfeats, outsize) info = Dict( - :device => config.device, + :device => device, :nrounds => 0, - :feature_names => feature_names - ) + :feature_names => feature_names, + :target_levels => target_levels, + :target_isordered => target_isordered) m = NeuroTreeModel(L, chain, info) - if config.device == :gpu + if device == :gpu m = m |> gpu end optim = OptimiserChain(NAdam(config.lr), WeightDecay(config.wd)) opts = Optimisers.setup(optim, m) - cache = (dtrain=dtrain, loss=loss, opts=opts, device=device, info=info) + cache = (dtrain=dtrain, loss=loss, opts=opts, info=info) return m, cache end """ function fit( - config::NeuroTreeRegressor, + config::NeuroTypes, dtrain; feature_names, target_name, @@ -53,15 +57,16 @@ end print_every_n=9999, early_stopping_rounds=9999, verbosity=1, - return_logger=false + device=:cpu, + gpuID=0, ) Training function of NeuroTreeModels' internal API. # Arguments -- `config::NeuroTreeRegressor` -- `dtrain`: Must be a `AbstractDataFrame` +- `config::NeuroTypes` +- `dtrain`: Must be `<:AbstractDataFrame` # Keyword arguments @@ -79,11 +84,12 @@ Training function of NeuroTreeModels' internal API. 
- `print_every_n=9999` - `early_stopping_rounds=9999` - `verbosity=1` -- `return_logger=false` +- `device=:cpu`: device on which to perform the computation, either `:cpu` or `:gpu` +- `gpuID=0`: gpu device to use, only relveant if `device = :gpu` """ function fit( - config::NeuroTreeRegressor, + config::NeuroTypes, dtrain; feature_names, target_name, @@ -94,28 +100,33 @@ function fit( print_every_n=9999, early_stopping_rounds=9999, verbosity=1, - return_logger=false + device=:cpu, + gpuID=0, ) - feature_names = Symbol.(feature_names) - if config.device == :gpu - CUDA.device!(config.gpuID) + device = Symbol(device) + if device == :gpu + CUDA.device!(gpuID) end - # initialize callback and logger if tracking eval data + feature_names = Symbol.(feature_names) + target_name = Symbol(target_name) + weight_name = isnothing(weight_name) ? nothing : Symbol(weight_name) + offset_name = isnothing(offset_name) ? nothing : Symbol(offset_name) metric = isnothing(metric) ? nothing : Symbol(metric) + + m, cache = init(config, dtrain; feature_names, target_name, weight_name, offset_name, device) + + # initialize callback and logger if tracking eval data logging_flag = !isnothing(metric) && !isnothing(deval) any_flag = !isnothing(metric) || !isnothing(deval) if !logging_flag && any_flag @warn "For logger and eval metric to be tracked, `metric` and `deval` must both be provided." end - logger = Dict[] - logger = nothing - - m, cache = init(config, dtrain; feature_names, target_name, weight_name, offset_name) + logger = nothing if logging_flag - cb = CallBack(config, deval; metric, feature_names, target_name, weight_name, offset_name) + cb = CallBack(config, deval; metric, feature_names, target_name, weight_name, offset_name, device) logger = init_logger(; metric, early_stopping_rounds) cb(logger, 0, m) (verbosity > 0) && @info "Init training" metric = logger[:metrics][end] @@ -136,11 +147,8 @@ function fit( end end - if return_logger - return (m, logger) - else - return m - end + m.info[:logger] = logger + return m end function fit_iter!(m, cache) diff --git a/src/learners.jl b/src/learners.jl new file mode 100644 index 0000000..4a5a75f --- /dev/null +++ b/src/learners.jl @@ -0,0 +1,384 @@ +abstract type LossType end +abstract type MSE <: LossType end +abstract type MAE <: LossType end +abstract type LogLoss <: LossType end +abstract type MLogLoss <: LossType end +abstract type GaussianMLE <: LossType end + +const _loss_type_dict = Dict( + :mse => MSE, + :mae => MAE, + :logloss => LogLoss, + :gaussian_mle => GaussianMLE, + :mlogloss => MLogLoss +) + +mutable struct NeuroTreeRegressor <: MMI.Deterministic + loss::Symbol + nrounds::Int + lr::Float32 + wd::Float32 + batchsize::Int + actA::Symbol + depth::Int + ntrees::Int + hidden_size::Int + stack_size::Int + init_scale::Float32 + MLE_tree_split::Bool + rng::Any +end + +""" + NeuroTreeRegressor(; kwargs...) + +A model type for constructing a NeuroTreeRegressor, based on [NeuroTreeModels.jl](https://github.com/Evovest/NeuroTreeModels.jl), and implementing both an internal API and the MLJ model interface. + +# Hyper-parameters + +- `loss=:mse`: Loss to be be minimized during training. One of: + - `:mse` + - `:mae` + - `:logloss` + - `:mlogloss` + - `:gaussian_mle` +- `nrounds=10`: Max number of rounds (epochs). +- `lr=1.0f-2`: Learning rate. Must be > 0. A lower `eta` results in slower learning, typically requiring a higher `nrounds`. +- `wd=0.f0`: Weight decay applied to the gradients by the optimizer. +- `batchsize=2048`: Batch size. 
+- `actA=:tanh`: Activation function applied to each input variable for determination of split node weight. Can be one of: + - `:tanh` + - `:identity` +- `depth=6`: Depth of a tree. Must be >= 1. A tree of depth 1 has 2 prediction leaf nodes. A complete tree of depth N contains `2^N` terminal leaves and `2^N - 1` split nodes. + Compute cost is proportional to `2^depth`. Typical optimal values are in the 3 to 5 range. +- `ntrees=64`: Number of trees (per stack). +- `hidden_size=16`: Size of hidden layers. Applicable only when `stack_size` > 1. +- `stack_size=1`: Number of stacked NeuroTree blocks. +- `init_scale=1.0`: Scaling factor applied to the predictions weights. Values in the `]0, 1]` range result in best performance. +- `MLE_tree_split=false`: Whether independent models are built for each of the 2 parameters (mu, sigma) of the `gaussian_mle` loss. +- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`). + +# Internal API + +Do `config = NeuroTreeRegressor()` to construct an instance with default hyper-parameters. +Provide keyword arguments to override hyper-parameter defaults, as in NeuroTreeRegressor(loss=...). + +## Training model + +A model is trained using [`fit`](@ref): + +```julia +m = fit(config, dtrain; feature_names, target_name, kwargs...) +``` + +## Inference + +Models act as a functor, returning predictions when called as a function with features as argument: + +```julia +m(data) +``` + +# MLJ Interface + +From MLJ, the type can be imported using: + +```julia +NeuroTreeRegressor = @load NeuroTreeRegressor pkg=NeuroTreeModels +``` + +Do `model = NeuroTreeRegressor()` to construct an instance with default hyper-parameters. +Provide keyword arguments to override hyper-parameter defaults, as in `NeuroTreeRegressor(loss=...)`. + +## Training model + +In MLJ or MLJBase, bind an instance `model` to data with + `mach = machine(model, X, y)` where +- `X`: any table of input features (eg, a `DataFrame`) whose columns + each have one of the following element scitypes: `Continuous`, + `Count`, or `<:OrderedFactor`; check column scitypes with `schema(X)` +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:Continuous`; check the scitype + with `scitype(y)` + +Train the machine using `fit!(mach, rows=...)`. + +## Operations + +- `predict(mach, Xnew)`: return predictions of the target given + features `Xnew` having the same scitype as `X` above. + +## Fitted parameters + +The fields of `fitted_params(mach)` are: + - `:fitresult`: The `NeuroTreeModel` object. + +## Report + +The fields of `report(mach)` are: + - `:features`: The names of the features encountered in training. + +# Examples + +## Internal API + +```julia +using NeuroTreeModels, DataFrames +config = NeuroTreeRegressor(depth=5, nrounds=10) +nobs, nfeats = 1_000, 5 +dtrain = DataFrame(randn(nobs, nfeats), :auto) +dtrain.y = rand(nobs) +feature_names, target_name = names(dtrain, r"x"), "y" +m = fit(config, dtrain; feature_names, target_name) +p = m(dtrain) +``` + +## MLJ Interface + +```julia +using MLJBase, NeuroTreeModels +m = NeuroTreeRegressor(depth=5, nrounds=10) +X, y = @load_boston +mach = machine(m, X, y) |> fit! +p = predict(mach, X) +``` +""" +function NeuroTreeRegressor(; kwargs...)
+ + # defaults arguments + args = Dict{Symbol,Any}( + :loss => :mse, + :nrounds => 10, + :lr => 1.0f-2, + :wd => 0.0f0, + :batchsize => 2048, + :actA => :tanh, + :depth => 4, + :ntrees => 64, + :hidden_size => 1, + :stack_size => 1, + :init_scale => 0.1, + :MLE_tree_split => false, + :rng => 123, + ) + + args_ignored = setdiff(keys(kwargs), keys(args)) + args_ignored_str = join(args_ignored, ", ") + length(args_ignored) > 0 && + @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." + + args_default = setdiff(keys(args), keys(kwargs)) + args_default_str = join(args_default, ", ") + length(args_default) > 0 && + @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)." + + args_override = intersect(keys(args), keys(kwargs)) + for arg in args_override + args[arg] = kwargs[arg] + end + + loss = Symbol(args[:loss]) + loss ∉ [:mse, :mae, :logloss, :gaussian_mle] && error("The provided kwarg `loss`: $loss is not supported.") + + args[:rng] = mk_rng(args[:rng]) + + config = NeuroTreeRegressor( + args[:loss], + args[:nrounds], + Float32(args[:lr]), + Float32(args[:wd]), + args[:batchsize], + Symbol(args[:actA]), + args[:depth], + args[:ntrees], + args[:hidden_size], + args[:stack_size], + args[:init_scale], + args[:MLE_tree_split], + args[:rng] + ) + + return config +end + + +mutable struct NeuroTreeClassifier <: MMI.Probabilistic + loss::Symbol + nrounds::Int + lr::Float32 + wd::Float32 + batchsize::Int + actA::Symbol + depth::Int + ntrees::Int + hidden_size::Int + stack_size::Int + init_scale::Float32 + MLE_tree_split::Bool + rng::Any +end + +""" + NeuroTreeClassifier(; kwargs...) + +A model type for constructing a NeuroTreeClassifier, based on [NeuroTreeModels.jl](https://github.com/Evovest/NeuroTreeModels.jl), and implementing both an internal API and the MLJ model interface. + +# Hyper-parameters + +- `nrounds=10`: Max number of rounds (epochs). +- `lr=1.0f-2`: Learning rate. Must be > 0. A lower `lr` results in slower learning, typically requiring a higher `nrounds`. +- `wd=0.f0`: Weight decay applied to the gradients by the optimizer. +- `batchsize=2048`: Batch size. +- `actA=:tanh`: Activation function applied to each input variable for determination of split node weight. Can be one of: + - `:tanh` + - `:identity` +- `depth=6`: Depth of a tree. Must be >= 1. A tree of depth 1 has 2 prediction leaf nodes. A complete tree of depth N contains `2^N` terminal leaves and `2^N - 1` split nodes. + Compute cost is proportional to `2^depth`. Typical optimal values are in the 3 to 5 range. +- `ntrees=64`: Number of trees (per stack). +- `hidden_size=16`: Size of hidden layers. Applicable only when `stack_size` > 1. +- `stack_size=1`: Number of stacked NeuroTree blocks. +- `init_scale=1.0`: Scaling factor applied to the predictions weights. Values in the `]0, 1]` range result in best performance. +- `MLE_tree_split=false`: Whether independent models are built for each of the 2 parameters (mu, sigma) of the `gaussian_mle` loss. +- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`). + +# Internal API + +Do `config = NeuroTreeClassifier()` to construct an instance with default hyper-parameters. +Provide keyword arguments to override hyper-parameter defaults, as in NeuroTreeClassifier(depth=...).
+ +## Training model + +A model is trained using [`fit`](@ref): + +```julia +m = fit(config, dtrain; feature_names, target_name, kwargs...) +``` + +## Inference + +Models act as a functor, returning predictions when called as a function with features as argument: + +```julia +m(data) +``` + +# MLJ Interface + +From MLJ, the type can be imported using: + +```julia +NeuroTreeClassifier = @load NeuroTreeClassifier pkg=NeuroTreeModels +``` + +Do `model = NeuroTreeClassifier()` to construct an instance with default hyper-parameters. +Provide keyword arguments to override hyper-parameter defaults, as in `NeuroTreeClassifier(depth=...)`. + +## Training model + +In MLJ or MLJBase, bind an instance `model` to data with + `mach = machine(model, X, y)` where +- `X`: any table of input features (eg, a `DataFrame`) whose columns + each have one of the following element scitypes: `Continuous`, + `Count`, or `<:OrderedFactor`; check column scitypes with `schema(X)` +- `y`: is the target, which can be any `AbstractVector` whose element + scitype is `<:Finite`; check the scitype + with `scitype(y)` + +Train the machine using `fit!(mach, rows=...)`. + +## Operations + +- `predict(mach, Xnew)`: return predictions of the target given + features `Xnew` having the same scitype as `X` above. + +## Fitted parameters + +The fields of `fitted_params(mach)` are: + - `:fitresult`: The `NeuroTreeModel` object. + +## Report + +The fields of `report(mach)` are: + - `:features`: The names of the features encountered in training. + +# Examples + +## Internal API + +```julia +using NeuroTreeModels, DataFrames, CategoricalArrays, Random +config = NeuroTreeClassifier(depth=5, nrounds=10) +nobs, nfeats = 1_000, 5 +dtrain = DataFrame(randn(nobs, nfeats), :auto) +dtrain.y = categorical(rand(1:2, nobs)) +feature_names, target_name = names(dtrain, r"x"), "y" +m = fit(config, dtrain; feature_names, target_name) +p = m(dtrain) +``` + +## MLJ Interface + +```julia +using MLJBase, NeuroTreeModels +m = NeuroTreeClassifier(depth=5, nrounds=10) +X, y = @load_crabs +mach = machine(m, X, y) |> fit! +p = predict(mach, X) +``` +""" +function NeuroTreeClassifier(; kwargs...) + + # defaults arguments + args = Dict{Symbol,Any}( + :nrounds => 10, + :lr => 1.0f-2, + :wd => 0.0f0, + :batchsize => 2048, + :actA => :tanh, + :depth => 4, + :ntrees => 64, + :hidden_size => 1, + :stack_size => 1, + :init_scale => 0.1, + :MLE_tree_split => false, + :rng => 123, + ) + + args_ignored = setdiff(keys(kwargs), keys(args)) + args_ignored_str = join(args_ignored, ", ") + length(args_ignored) > 0 && + @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." + + args_default = setdiff(keys(args), keys(kwargs)) + args_default_str = join(args_default, ", ") + length(args_default) > 0 && + @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)."
+ + args_override = intersect(keys(args), keys(kwargs)) + for arg in args_override + args[arg] = kwargs[arg] + end + + args[:rng] = mk_rng(args[:rng]) + + config = NeuroTreeClassifier( + :mlogloss, + args[:nrounds], + Float32(args[:lr]), + Float32(args[:wd]), + args[:batchsize], + Symbol(args[:actA]), + args[:depth], + args[:ntrees], + args[:hidden_size], + args[:stack_size], + args[:init_scale], + args[:MLE_tree_split], + args[:rng], + ) + + return config +end + +const NeuroTypes = Union{NeuroTreeRegressor,NeuroTreeClassifier} +get_loss_type(config::NeuroTypes) = _loss_type_dict[config.loss] diff --git a/src/loss.jl b/src/loss.jl index 30a716a..24defdd 100644 --- a/src/loss.jl +++ b/src/loss.jl @@ -73,5 +73,4 @@ const _loss_fn_dict = Dict( :gaussian_mle => gaussian_mle, ) -get_loss_fn(config::NeuroTreeRegressor) = _loss_fn_dict[config.loss] - +get_loss_fn(config::NeuroTypes) = _loss_fn_dict[config.loss] diff --git a/src/metrics.jl b/src/metrics.jl index 4fecba1..d3aab24 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -151,4 +151,4 @@ is_maximise(::typeof(logloss)) = false is_maximise(::typeof(mlogloss)) = false is_maximise(::typeof(gaussian_mle)) = true -end \ No newline at end of file +end diff --git a/src/model.jl b/src/model.jl index 27e0948..9894f63 100644 --- a/src/model.jl +++ b/src/model.jl @@ -1,216 +1,3 @@ -abstract type LossType end -abstract type MSE <: LossType end -abstract type MAE <: LossType end -abstract type LogLoss <: LossType end -abstract type MLogLoss <: LossType end -abstract type GaussianMLE <: LossType end - -const _loss_type_dict = Dict( - :mse => MSE, - :mae => MAE, - :logloss => LogLoss, - :mlogloss => MLogLoss, - :gaussian_mle => GaussianMLE, -) - -mutable struct NeuroTreeRegressor <: MMI.Deterministic - loss::Symbol - nrounds::Int - lr::Float32 - wd::Float32 - batchsize::Int - actA::Symbol - outsize::Int - depth::Int - ntrees::Int - hidden_size::Int - stack_size::Int - init_scale::Float32 - MLE_tree_split::Bool - rng::Any - device::Symbol - gpuID::Int -end - -""" - NeuroTreeRegressor(;kwargs...) - -A model type for constructing a NeuroTreeRegressor, based on [NeuroTreeModels.jl](https://github.com/Evovest/NeuroTreeModels.jl), and implementing both an internal API and the MLJ model interface. - -# Hyper-parameters - -- `loss=:mse`: Loss to be be minimized during training. One of: - - `:mse` - - `:mae` - - `:logloss` - - `:mlogloss` - - `:gaussian_mle` -- `nrounds=10`: Max number of rounds (epochs). -- `lr=1.0f-2`: Learning rate. Must be > 0. A lower `eta` results in slower learning, typically requiring a higher `nrounds`. -- `wd=0.f0`: Weight decay applied to the gradients by the optimizer. -- `batchsize=2048`: Batch size. -- `actA=:tanh`: Activation function applied to each of input variable for determination of split node weight. Can be one of: - - `:tanh` - - `:identity` -- `outsize=1`: Number of predictions returned by the model. Typically only used for classification tasks and set to the number of target levels / classes. -- `depth=6`: Depth of a tree. Must be >= 1. A tree of depth 1 has 2 prediction leaf nodes. A complete tree of depth N contains `2^N` terminal leaves and `2^N - 1` split nodes. - Compute cost is proportional to `2^depth`. Typical optimal values are in the 3 to 5 range. -- `ntrees=64`: Number of trees (per stack). -- `hidden_size=16`: Size of hidden layers. Applicable only when `stack_size` > 1. -- `stack_size=1`: Number of stacked NeuroTree blocks. -- `init_scale=1.0`: Scaling factor applied to the predictions weights. 
Values in the `]0, 1]` short result in best performance. -- `MLE_tree_split=false`: Whether independent models are buillt for each of the 2 parameters (mu, sigma) of the the `gaussian_mle` loss. -- `rng=123`: Either an integer used as a seed to the random number generator or an actual random number generator (`::Random.AbstractRNG`). -- `device=:cpu`: Device to use. Either `:cpu` or `:gpu` (recommended as it improves significantly the training speed). -- `gpuID=0`: ID of the GPU to use for training. - -# Internal API - -Do `config = NeuroTreeRegressor()` to construct an instance with default hyper-parameters. -Provide keyword arguments to override hyper-parameter defaults, as in NeuroTreeRegressor(loss=...). - -## Training model - -A model is trained using [`fit`](@ref): - -```julia -m = fit(config, dtrain; feature_names, target_name, kwargs...) -``` - -## Inference - -Models act as a functor. returning predictions when called as a function with features as argument: - -```julia -m(data) -``` - -# MLJ Interface - -From MLJ, the type can be imported using: - -```julia -NeuroTreeRegressor = @load NeuroTreeRegressor pkg=NeuroTreeModels -``` - -Do `model = NeuroTreeRegressor()` to construct an instance with default hyper-parameters. -Provide keyword arguments to override hyper-parameter defaults, as in `NeuroTreeRegressor(loss=...)`. - -## Training model - -In MLJ or MLJBase, bind an instance `model` to data with - `mach = machine(model, X, y)` where -- `X`: any table of input features (eg, a `DataFrame`) whose columns - each have one of the following element scitypes: `Continuous`, - `Count`, or `<:OrderedFactor`; check column scitypes with `schema(X)` -- `y`: is the target, which can be any `AbstractVector` whose element - scitype is `<:Continuous`; check the scitype - with `scitype(y)` - -Train the machine using `fit!(mach, rows=...)`. - -## Operations - -- `predict(mach, Xnew)`: return predictions of the target given - features `Xnew` having the same scitype as `X` above. - -## Fitted parameters - -The fields of `fitted_params(mach)` are: - - `:fitresult`: The `NeuroTreeModel` object. - -## Report - -The fields of `report(mach)` are: - - `:features`: The names of the features encountered in training. - -# Examples - -## Internal API - -```julia -using NeuroTreeModels, DataFrames -config = NeuroTreeRegressor(depth=5, nrounds=10) -nobs, nfeats = 1_000, 5 -dtrain = DataFrame(randn(nobs, nfeats), :auto) -dtrain.y = rand(nobs) -feature_names, target_name = names(dtrain, r"x"), "y" -m = fit(config, dtrain; feature_names, target_name) -p = m(dtrain) -``` - -## MLJ Interface - -```julia -using MLJBase, NeuroTreeModels -m = NeuroTreeRegressor(depth=5, nrounds=10) -X, y = @load_boston -mach = machine(m, X, y) |> fit! -p = predict(mach, X) -``` -""" -function NeuroTreeRegressor(; kwargs...) - - # defaults arguments - args = Dict{Symbol,Any}( - :loss => :mse, - :nrounds => 10, - :lr => 1.0f-2, - :wd => 0.0f0, - :batchsize => 2048, - :actA => :tanh, - :outsize => 1, - :depth => 4, - :ntrees => 64, - :hidden_size => 1, - :stack_size => 1, - :init_scale => 0.1, - :MLE_tree_split => false, - :rng => 123, - :device => :cpu, - :gpuID => 0, - ) - - args_ignored = setdiff(keys(kwargs), keys(args)) - args_ignored_str = join(args_ignored, ", ") - length(args_ignored) > 0 && - @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." 
- - args_default = setdiff(keys(args), keys(kwargs)) - args_default_str = join(args_default, ", ") - length(args_default) > 0 && - @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)." - - args_override = intersect(keys(args), keys(kwargs)) - for arg in args_override - args[arg] = kwargs[arg] - end - - args[:rng] = mk_rng(args[:rng]) - - config = NeuroTreeRegressor( - Symbol(args[:loss]), - args[:nrounds], - Float32(args[:lr]), - Float32(args[:wd]), - args[:batchsize], - Symbol(args[:actA]), - args[:outsize], - args[:depth], - args[:ntrees], - args[:hidden_size], - args[:stack_size], - args[:init_scale], - args[:MLE_tree_split], - args[:rng], - Symbol(args[:device]), - args[:gpuID], - ) - - return config -end - -get_loss_type(config::NeuroTreeRegressor) = _loss_type_dict[config.loss] struct NeuroTree{W,B,P} w::W @@ -245,7 +32,8 @@ end dot_prod_agg(lw, p) = dropdims(sum(reshape(lw, 1, size(lw)...) .* p, dims=(2, 3)), dims=(2, 3)) """ - NeuroTree + NeuroTree(; ins, outs, depth=4, ntrees=64, actA=identity, init_scale=1.0) + NeuroTree((ins, outs)::Pair{<:Integer,<:Integer}; depth=4, ntrees=64, actA=identity, init_scale=1.0) Initialization of a NeuroTree. """ @@ -327,7 +115,6 @@ end # return p # end - """ NeuroTreeModel A NeuroTreeModel is made of a collection of Tree, either regular `NeuroTree` or `StackTree`. @@ -367,21 +154,21 @@ const _act_dict = Dict( :hardsigmoid => hardsigmoid ) -function get_model_chain(L; config, nfeats) +function get_model_chain(L; config, nfeats, outsize) if L <: GaussianMLE && config.MLE_tree_split chain = Chain( BatchNorm(nfeats), Parallel( vcat, - StackTree(nfeats => config.outsize; + StackTree(nfeats => outsize; depth=config.depth, ntrees=config.ntrees, stack_size=config.stack_size, hidden_size=config.hidden_size, actA=_act_dict[config.actA], init_scale=config.init_scale), - StackTree(nfeats => config.outsize; + StackTree(nfeats => outsize; depth=config.depth, ntrees=config.ntrees, stack_size=config.stack_size, @@ -391,7 +178,7 @@ function get_model_chain(L; config, nfeats) ) ) else - outsize = L <: GaussianMLE ? 2 * config.outsize : config.outsize + outsize = L <: GaussianMLE ? 
2 * outsize : outsize chain = Chain( BatchNorm(nfeats), StackTree(nfeats => outsize; diff --git a/test/MLJ.jl b/test/MLJ.jl index 3567ebb..78d01ca 100644 --- a/test/MLJ.jl +++ b/test/MLJ.jl @@ -14,6 +14,27 @@ sigmoid(x::AbstractVector) = sigmoid.(x) ) @test isempty(failures) end + @testset "NeuroTreeClassifier" begin + + failures, summary = MLJTestInterface.test( + [NeuroTreeClassifier], + MLJTestInterface.make_binary()...; + mod=@__MODULE__, + verbosity=0, # bump to debug + throw=true # set to true to debug + ) + @test isempty(failures) + + failures, summary = MLJTestInterface.test( + [NeuroTreeClassifier], + MLJTestInterface.make_multiclass()...; + mod=@__MODULE__, + verbosity=0, # bump to debug + throw=true # set to true to debug + ) + @test isempty(failures) + + end end ################################################## @@ -70,4 +91,39 @@ end predict(mach, X) end -MLJTestInterface.make_regression() +@testset "MLJ - classification" begin + X, y = @load_crabs + + tree_model = NeuroTreeClassifier( + depth=4, + lr=0.1, + nrounds=20, + batchsize=64 + ) + + # @load EvoTreeRegressor + mach = machine(tree_model, X, y) + train, test = partition(eachindex(y), 0.7, shuffle=true) # 70:30 split + fit!(mach, rows=train, verbosity=1) + + mach.model.nrounds += 50 + fit!(mach, rows=train, verbosity=1) + + pred_train = predict(mach, selectrows(X, train)) + pred_train_mode = predict_mode(mach, selectrows(X, train)) + sum(pred_train_mode .== y[train]) / length(y[train]) + + pred_test = predict(mach, selectrows(X, test)) + pred_test_mode = predict_mode(mach, selectrows(X, test)) + pred_test_mode = predict_mode(mach, selectrows(X, test)) + sum(pred_test_mode .== y[test]) / length(y[test]) +end + +@testset "MLJ - support for ordered factor predictions" begin + X = (; x=rand(10)) + y = coerce(rand("ab", 10), OrderedFactor) + model = NeuroTreeClassifier() + mach = machine(model, X, y) |> fit! 
+ yhat = predict(mach, X) + @assert isordered(yhat) +end diff --git a/test/core.jl b/test/core.jl index 9f8bf9f..34ded19 100644 --- a/test/core.jl +++ b/test/core.jl @@ -1,7 +1,6 @@ -@testset "Regression test" begin +@testset "Core - internals test" begin config = NeuroTreeRegressor( - device=:cpu, loss=:mse, actA=:identity, init_scale=1.0, @@ -10,7 +9,6 @@ ntrees=32, stack_size=1, hidden_size=1, - outsize=1, batchsize=2048, lr=1e-3, ) @@ -21,11 +19,12 @@ x = rand(Float32, nfeats, nobs) feature_names = "var_" .* string.(1:nobs) + outsize = 1 loss = NeuroTreeModels.get_loss_fn(config) L = NeuroTreeModels.get_loss_type(config) - chain = NeuroTreeModels.get_model_chain(L; config, nfeats) + chain = NeuroTreeModels.get_model_chain(L; config, nfeats, outsize) info = Dict( - :device => config.device, + :device => :cpu, :nrounds => 0, :feature_names => feature_names ) @@ -33,12 +32,52 @@ end +@testset "Core - Regression" begin + + Random.seed!(123) + X, y = rand(1000, 10), randn(1000) + df = DataFrame(X, :auto) + df[!, :y] = y + target_name = "y" + feature_names = setdiff(names(df), [target_name]) + + train_ratio = 0.8 + train_indices = randperm(nrow(df))[1:Int(train_ratio * nrow(df))] + + dtrain = df[train_indices, :] + deval = df[setdiff(1:nrow(df), train_indices), :] + + config = NeuroTreeRegressor( + loss=:mse, + nrounds=20, + depth=3, + lr=1e-1, + ) + + m = NeuroTreeModels.fit( + config, + dtrain; + target_name, + feature_names + ) + + m = NeuroTreeModels.fit( + config, + dtrain; + target_name, + feature_names, + deval, + metric=:mse + ) + +end + @testset "Classification test" begin Random.seed!(123) X, y = @load_crabs df = DataFrame(X) - df[!, :class] .= levelcode.(y) + df[!, :class] = y target_name = "class" feature_names = setdiff(names(df), [target_name]) @@ -48,13 +87,11 @@ end dtrain = df[train_indices, :] deval = df[setdiff(1:nrow(df), train_indices), :] - config = NeuroTreeRegressor( - device=:cpu, - loss=:mlogloss, + config = NeuroTreeClassifier( nrounds=100, - outsize=3, - depth=3, - lr=1e-1, + depth=4, + lr=3e-2, + batchsize=64 ) m = NeuroTreeModels.fit( @@ -64,14 +101,15 @@ end target_name, feature_names, metric=:mlogloss, - print_every_n=10, - early_stopping_rounds=2, + early_stopping_rounds=10, + # print_every_n=10, + device=:cpu ) # Predictions depend on the number of samples in the dataset ptrain = [argmax(x) for x in eachrow(m(dtrain))] peval = [argmax(x) for x in eachrow(m(deval))] - @test mean(ptrain .== dtrain.class) > 0.95 - @test mean(peval .== deval.class) > 0.95 + @test mean(ptrain .== levelcode.(dtrain.class)) > 0.95 + @test mean(peval .== levelcode.(deval.class)) > 0.95 -end \ No newline at end of file +end From e73cc6c1617f3c65cf3d8231af34e68de8f95fed Mon Sep 17 00:00:00 2001 From: jeremie Date: Sun, 21 Apr 2024 13:02:21 -0400 Subject: [PATCH 4/6] clean docs --- docs/src/design.md | 2 +- docs/src/tutorials-classification-iris.md | 21 +++++++++++---------- docs/src/tutorials-logistic-titanic.md | 2 +- docs/src/tutorials-regression-boston.md | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 9a025a6..3c3a5d0 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -42,7 +42,7 @@ The following illustrate how a basic decision tree is represented as a single di To illustrate how a NeuroTree derives the soft decision probability (referred to `NW1 - NW3` in the above figure), we first break down how a traditional tree split condition is derived from 2 underlying decisions: -1. 
*Selection of the feature on which to perform the condition*. +1. *Selection of the feature on which to apply the condition*. Such selection can be represented as the application of a binary mask where all elements are set to `false` except for that single selected feature where it's set to `true`. 2. *Selection of the condition's threshold value*. diff --git a/docs/src/tutorials-classification-iris.md b/docs/src/tutorials-classification-iris.md index 474c5b3..fc06cd4 100644 --- a/docs/src/tutorials-classification-iris.md +++ b/docs/src/tutorials-classification-iris.md @@ -19,12 +19,12 @@ Random.seed!(123) ## Preprocessing Before we can train our model, we need to preprocess the dataset. We will convert the class variable, which specifies the type of iris flower, into a categorical variable. +For classification tasks, it's a requirement that `eltype(target_var)<:CategoricalValue`. ```julia df = MLDatasets.Iris().dataframe df[!, :class] = categorical(df[!, :class]) -df[!, :class] .= levelcode.(df[!, :class]) target_name = "class" feature_names = setdiff(names(df), [target_name]) @@ -37,17 +37,15 @@ deval = df[setdiff(1:nrow(df), train_indices), :] ## Training -Now we are ready to train our model. We first define a model configuration using the [`NeuroTreeRegressor`](@ref) model constructor. +Now we are ready to train our model. We first define a model configuration using the [`NeuroTreeClassifier`](@ref) model constructor. Then, we use [`NeuroTreeModels.fit`](@ref) to train a boosted tree model. We pass the optional `deval` argument to enable the usage of early stopping. ```julia -config = NeuroTreeRegressor( - device=:cpu, - loss=:mlogloss, +config = NeuroTreeClassifier( nrounds=400, - outsize=3, depth=4, - lr=2e-2, + lr=5e-2, + batchsize=60, ) m = NeuroTreeModels.fit( @@ -71,10 +69,13 @@ p_train = m(dtrain) p_eval = m(deval) ``` +Note that the raw predictions for a classification task a `Matrix` where each row is the vector of probability for each of the target levels. +It can be converted into a predicted class index using `NeuroTreeModels.onecold` (imported from Flux), or `[argmax(p) for p in eachrow(p_train)]`. + ```julia-repl -julia> mean(dtrain[!, target_name] .== NeuroTreeModels.onecold(p_train')) -0.9833333333333333 +julia> mean(levelcode.(dtrain[!, target_name]) .== NeuroTreeModels.onecold(p_train')) +0.975 -julia> mean(deval[!, target_name] .== NeuroTreeModels.onecold(p_eval')) +julia> mean(levelcode.(deval[!, target_name]) .== NeuroTreeModels.onecold(p_eval')) 1.0 ``` \ No newline at end of file diff --git a/docs/src/tutorials-logistic-titanic.md b/docs/src/tutorials-logistic-titanic.md index 1b5d7f3..4e83c79 100644 --- a/docs/src/tutorials-logistic-titanic.md +++ b/docs/src/tutorials-logistic-titanic.md @@ -64,7 +64,6 @@ Then, we use [`NeuroTreeModels.fit`](@ref) to train a boosted tree model. We pas ```julia config = NeuroTreeRegressor( - device=:cpu, loss=:logloss, nrounds=400, depth=4, @@ -80,6 +79,7 @@ m = NeuroTreeModels.fit( metric=:logloss, print_every_n=10, early_stopping_rounds=2, + device=:cpu ) ``` diff --git a/docs/src/tutorials-regression-boston.md b/docs/src/tutorials-regression-boston.md index 2a9a2d0..5275c68 100644 --- a/docs/src/tutorials-regression-boston.md +++ b/docs/src/tutorials-regression-boston.md @@ -44,7 +44,6 @@ Then, we use [`NeuroTreeModels.fit`](@ref) to train a boosted tree model. 
We pas ```julia config = NeuroTreeRegressor( - device=:cpu, loss=:mse, nrounds=400, depth=5, @@ -60,6 +59,7 @@ m = NeuroTreeModels.fit( metric=:mse, print_every_n=10, early_stopping_rounds=2, + device=:cpu ) ``` From 4c87703565b52acf536c648a807c40ac495ad2c9 Mon Sep 17 00:00:00 2001 From: "jeremie.desgagne.bouchard" Date: Sun, 21 Apr 2024 13:06:46 -0400 Subject: [PATCH 5/6] tweedie --- README.md | 2 +- benchmarks/MSRank-tweedie.jl | 89 ------------------------------------ benchmarks/MSRank.jl | 6 +-- benchmarks/YEAR-tweedie.jl | 82 +++++++++++++++++++++++++++++++++ src/infer.jl | 10 ++++ src/loss.jl | 6 +-- src/metrics.jl | 6 +-- 7 files changed, 101 insertions(+), 100 deletions(-) delete mode 100644 benchmarks/MSRank-tweedie.jl create mode 100644 benchmarks/YEAR-tweedie.jl diff --git a/README.md b/README.md index 8a6234e..08d007b 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## Installation -```julia-repl +```julia ] add NeuroTreeModels ``` diff --git a/benchmarks/MSRank-tweedie.jl b/benchmarks/MSRank-tweedie.jl deleted file mode 100644 index 18dd3fd..0000000 --- a/benchmarks/MSRank-tweedie.jl +++ /dev/null @@ -1,89 +0,0 @@ -using Revise -using Random -using CSV -using DataFrames -using StatsBase -using Statistics: mean, std -using NeuroTreeModels -using Solage: Connectors -using ReadLIBSVM -using AWS: AWSCredentials, AWSConfig, @service - -# https://www.microsoft.com/en-us/research/project/mslr/ - -@service S3 -aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"]) -aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1") -bucket = "jeremiedb" - -# initial prep -function read_libsvm_aws(file::String; has_query=false, aws_config=AWSConfig()) - raw = S3.get_object("jeremiedb", file, Dict("response-content-type" => "application/octet-stream"); aws_config) - return read_libsvm(raw; has_query) -end - -@time train_raw = read_libsvm_aws("share/data/msrank/train.txt"; has_query=true, aws_config); -@time eval_raw = read_libsvm_aws("share/data/msrank/vali.txt"; has_query=true, aws_config); -@time test_raw = read_libsvm_aws("share/data/msrank/test.txt"; has_query=true, aws_config); - -dtrain = DataFrame(train_raw[:x], :auto) -dtrain.y_raw = train_raw[:y] -dtrain.y = dtrain.y_raw ./ 4 -dtrain.q = train_raw[:q] - -deval = DataFrame(eval_raw[:x], :auto) -deval.y_raw = eval_raw[:y] -deval.y = deval.y_raw ./ 4 -deval.q = eval_raw[:q] - -dtest = DataFrame(test_raw[:x], :auto) -dtest.y_raw = test_raw[:y] -dtest.y = dtest.y_raw ./ 4 -dtest.q = test_raw[:q] - -feature_names = setdiff(names(dtrain), ["y", "y_raw", "q"]) -target_name = "y_raw" - -function percent_rank(x::AbstractVector{T}) where {T} - return tiedrank(x) / (length(x) + 1) -end - -transform!(dtrain, feature_names .=> percent_rank .=> feature_names) -transform!(deval, feature_names .=> percent_rank .=> feature_names) -transform!(dtest, feature_names .=> percent_rank .=> feature_names) - -config = NeuroTreeRegressor( - device=:gpu, - loss=:tweedie_deviance, - nrounds=2, - actA=:tanh, - outsize=1, - depth=4, - ntrees=64, - stack_size=2, - hidden_size=16, - batchsize=4096, - lr=3e-4, -) - -@time m, logger = NeuroTreeModels.fit( - config, - dtrain; - deval, - target_name, - feature_names, - print_every_n=1, - early_stopping_rounds=3, - metric=:tweedie_deviance, - return_logger=true -); - -dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); -p_eval = m(dinfer_eval); -mse_eval = mean((p_eval .- deval.y_raw) .^ 2) -@info "MSE - 
deval" mse_eval - -dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); -p_test = m(dinfer_test); -mse_test = mean((p_test .- dtest.y_raw) .^ 2) -@info "MSE - dtest" mse_test diff --git a/benchmarks/MSRank.jl b/benchmarks/MSRank.jl index 58fa386..572cae1 100644 --- a/benchmarks/MSRank.jl +++ b/benchmarks/MSRank.jl @@ -78,12 +78,10 @@ config = NeuroTreeRegressor( return_logger=true ); -dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); -p_eval = m(dinfer_eval); +p_eval = m(deval); mse_eval = mean((p_eval .- deval.y_raw) .^ 2) @info "MSE - deval" mse_eval -dinfer_test = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); -p_test = m(dinfer_test); +p_test = m(dtest); mse_test = mean((p_test .- dtest.y_raw) .^ 2) @info "MSE - dtest" mse_test diff --git a/benchmarks/YEAR-tweedie.jl b/benchmarks/YEAR-tweedie.jl new file mode 100644 index 0000000..61c8009 --- /dev/null +++ b/benchmarks/YEAR-tweedie.jl @@ -0,0 +1,82 @@ +##################################################################### +# WIP: need to adapt the fit! function to support normal DataFrame (not just GroupedOne) +# Have dataloader adapted to DF vs GDF (both at fit init and callback init) +##################################################################### + +using Revise +using Random +using CSV +using DataFrames +using StatsBase +using Statistics: mean, std +using NeuroTreeModels + +using AWS: AWSCredentials, AWSConfig, @service +@service S3 +aws_creds = AWSCredentials(ENV["AWS_ACCESS_KEY_ID_JDB"], ENV["AWS_SECRET_ACCESS_KEY_JDB"]) +aws_config = AWSConfig(; creds=aws_creds, region="ca-central-1") + +path = "share/data/year/year.csv" +raw = S3.get_object("jeremiedb", path, Dict("response-content-type" => "application/octet-stream"); aws_config) +df = DataFrame(CSV.File(raw, header=false)) +df_tot = copy(df) + +path = "share/data/year/year-train-idx.txt" +raw = S3.get_object("jeremiedb", path, Dict("response-content-type" => "application/octet-stream"); aws_config) +train_idx = DataFrame(CSV.File(raw, header=false))[:, 1] .+ 1 + +path = "share/data/year/year-eval-idx.txt" +raw = S3.get_object("jeremiedb", path, Dict("response-content-type" => "application/octet-stream"); aws_config) +eval_idx = DataFrame(CSV.File(raw, header=false))[:, 1] .+ 1 + +transform!(df_tot, "Column1" => identity => "y_raw") +transform!(df_tot, "y_raw" => (x -> (x .- minimum(x)) ./ std(x)) => "y_norm") +select!(df_tot, Not("Column1")) +feature_names = setdiff(names(df_tot), ["y_raw", "y_norm"]) +target_name = "y_norm" + +function percent_rank(x::AbstractVector{T}) where {T} + return tiedrank(x) / (length(x) + 1) +end + +transform!(df_tot, feature_names .=> percent_rank .=> feature_names) + +dtrain = df_tot[train_idx, :]; +deval = df_tot[eval_idx, :]; +dtest = df_tot[(end-51630+1):end, :]; + +config = NeuroTreeRegressor( + device=:gpu, + loss=:tweedie_deviance, + actA=:tanh, + nrounds=200, + outsize=1, + depth=4, + ntrees=64, + hidden_size=8, + stack_size=1, + init_scale=1.0, + MLE_tree_split=true, + batchsize=2048, + lr=1e-3, +) + +@time m, logger = NeuroTreeModels.fit( + config, + dtrain; + deval, + target_name, + feature_names, + print_every_n=5, + early_stopping_rounds=2, + metric=:tweedie_deviance, + return_logger=true +); + +p_eval = m(deval); +mse_eval = mean((p_eval .- deval.y_norm) .^ 2) +@info "MSE raw - deval" mse_eval + +p_test = m(dtest); +mse_test = 
mean((p_test .- dtest.y_norm) .^ 2) * std(df_tot.y_raw)^2 +@info "MSE - dtest" mse_test diff --git a/src/infer.jl b/src/infer.jl index 06d2e29..72a3b73 100644 --- a/src/infer.jl +++ b/src/infer.jl @@ -54,3 +54,13 @@ function infer(m::NeuroTreeModel{<:GaussianMLE}, data::DL) p[:, 2] .= exp.(p[:, 2]) # reproject log(σ) into σ return p end + +function infer(m::NeuroTreeModel{L}, data::DL) where {L<:Union{TweedieDeviance}} + preds = Vector{Float32}[] + for x in data + push!(preds, Vector(m(x))) + end + p = vcat(preds...) + p .= exp.(p) + return p +end diff --git a/src/loss.jl b/src/loss.jl index f87c1e6..13bcc7a 100644 --- a/src/loss.jl +++ b/src/loss.jl @@ -56,17 +56,17 @@ end function mlogloss(m, x, y) p = logsoftmax(m(x); dims=1) k = size(p, 1) - mean(-sum(onehotbatch(y, 1:k) .* p; dims=1)) + mean(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1)) end function mlogloss(m, x, y, w) p = logsoftmax(m(x); dims=1) k = size(p, 1) - sum(-sum(onehotbatch(y, 1:k) .* p; dims=1) .* w) / sum(w) + sum(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1) .* w) / sum(w) end function mlogloss(m, x, y, w, offset) p = logsoftmax(m(x) .+ offset; dims=1) k = size(p, 1) - sum(-sum(onehotbatch(y, 1:k) .* p; dims=1) .* w) / sum(w) + sum(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1) .* w) / sum(w) end gaussian_mle_loss(μ::AbstractVector{T}, σ::AbstractVector{T}, y::AbstractVector{T}) where {T} = diff --git a/src/metrics.jl b/src/metrics.jl index 60c59d8..cad5786 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -100,21 +100,21 @@ end function mlogloss(m, x, y; agg=mean) p = logsoftmax(m(x); dims=1) k = size(p, 1) - raw = dropdims(-sum(onehotbatch(y, 1:k) .* p; dims=1); dims=1) + raw = dropdims(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1); dims=1) metric = agg(raw) return metric end function mlogloss(m, x, y, w; agg=mean) p = logsoftmax(m(x); dims=1) k = size(p, 1) - raw = dropdims(-sum(onehotbatch(y, 1:k) .* p; dims=1); dims=1) + raw = dropdims(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1); dims=1) metric = agg(raw .* w) return metric end function mlogloss(m, x, y, w, offset; agg=mean) p = logsoftmax(m(x) .+ offset; dims=1) k = size(p, 1) - raw = dropdims(-sum(onehotbatch(y, 1:k) .* p; dims=1); dims=1) + raw = dropdims(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1); dims=1) metric = agg(raw .* w) return metric end From f89699872159a57ad3d769b4d2921d22178947d1 Mon Sep 17 00:00:00 2001 From: "jeremie.desgagne.bouchard" Date: Sun, 21 Apr 2024 13:39:15 -0400 Subject: [PATCH 6/6] cleanup --- benchmarks/Higgs-logloss.jl | 5 +- benchmarks/MSRank.jl | 6 +- benchmarks/YEAR-gaussian.jl | 6 +- benchmarks/YEAR-mse.jl | 6 +- benchmarks/YEAR-tweedie.jl | 14 +- benchmarks/Yahoo-LTRC.jl | 6 +- benchmarks/aicrowd-test.jl | 4 +- src/learners.jl | 272 ++++++++++++++++++------------------ src/loss.jl | 6 +- src/metrics.jl | 6 +- 10 files changed, 158 insertions(+), 173 deletions(-) diff --git a/benchmarks/Higgs-logloss.jl b/benchmarks/Higgs-logloss.jl index 40aa13c..7690807 100644 --- a/benchmarks/Higgs-logloss.jl +++ b/benchmarks/Higgs-logloss.jl @@ -31,7 +31,6 @@ deval = df_tot[end-1_000_000+1:end-500_000, :]; dtest = df_tot[end-500_000+1:end, :]; config = NeuroTreeRegressor( - device=:gpu, loss=:logloss, nrounds=200, scaler=true, @@ -44,7 +43,7 @@ config = NeuroTreeRegressor( batchsize=8092, ) -@time m, logger = NeuroTreeModels.fit( +@time m = NeuroTreeModels.fit( config, dtrain; deval, @@ -53,7 +52,7 @@ config = NeuroTreeRegressor( print_every_n=1, early_stopping_rounds=2, metric=:logloss, - return_logger=true + device=:gpu, 
); dinfer_eval = NeuroTreeModels.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); diff --git a/benchmarks/MSRank.jl b/benchmarks/MSRank.jl index 572cae1..dc5248f 100644 --- a/benchmarks/MSRank.jl +++ b/benchmarks/MSRank.jl @@ -53,11 +53,9 @@ transform!(deval, feature_names .=> percent_rank .=> feature_names) transform!(dtest, feature_names .=> percent_rank .=> feature_names) config = NeuroTreeRegressor( - device=:gpu, loss=:mse, nrounds=2, actA=:tanh, - outsize=1, depth=4, ntrees=64, stack_size=2, @@ -66,7 +64,7 @@ config = NeuroTreeRegressor( lr=3e-4, ) -@time m, logger = NeuroTreeModels.fit( +@time m = NeuroTreeModels.fit( config, dtrain; deval, @@ -75,7 +73,7 @@ config = NeuroTreeRegressor( print_every_n=1, early_stopping_rounds=3, metric=:mse, - return_logger=true + device=:gpu, ); p_eval = m(deval); diff --git a/benchmarks/YEAR-gaussian.jl b/benchmarks/YEAR-gaussian.jl index 6f7a55e..6276faa 100644 --- a/benchmarks/YEAR-gaussian.jl +++ b/benchmarks/YEAR-gaussian.jl @@ -47,11 +47,9 @@ deval = df_tot[eval_idx, :]; dtest = df_tot[(end-51630+1):end, :]; config = NeuroTreeRegressor( - device=:gpu, loss=:gaussian_mle, actA=:identity, nrounds=200, - outsize=1, depth=4, ntrees=32, hidden_size=8, @@ -62,7 +60,7 @@ config = NeuroTreeRegressor( lr=1e-3, ) -@time m, logger = NeuroTreeModels.fit( +@time m = NeuroTreeModels.fit( config, dtrain; deval, @@ -71,7 +69,7 @@ config = NeuroTreeRegressor( print_every_n=5, early_stopping_rounds=2, metric=:gaussian_mle, - return_logger=true + device=:gpu ); # dinfer_eval = NeuroTrees.get_df_loader_infer(deval; feature_names, batchsize=config.batchsize, device=config.device); diff --git a/benchmarks/YEAR-mse.jl b/benchmarks/YEAR-mse.jl index a7d9d67..76e5e83 100644 --- a/benchmarks/YEAR-mse.jl +++ b/benchmarks/YEAR-mse.jl @@ -42,7 +42,6 @@ deval = df_tot[eval_idx, :]; dtest = df_tot[(end-51630+1):end, :]; config = NeuroTreeRegressor( - device=:gpu, loss=:mse, actA=:identity, init_scale=1.0, @@ -51,12 +50,11 @@ config = NeuroTreeRegressor( ntrees=32, stack_size=1, hidden_size=1, - outsize=1, batchsize=2048, lr=3e-4, ) -@time m, logger = NeuroTreeModels.fit( +@time m = NeuroTreeModels.fit( config, dtrain; deval, @@ -65,7 +63,7 @@ config = NeuroTreeRegressor( print_every_n=5, early_stopping_rounds=2, metric=:mse, - return_logger=true + device=:gpu ); # nfeats = length(feature_names) diff --git a/benchmarks/YEAR-tweedie.jl b/benchmarks/YEAR-tweedie.jl index 61c8009..f928a40 100644 --- a/benchmarks/YEAR-tweedie.jl +++ b/benchmarks/YEAR-tweedie.jl @@ -46,22 +46,16 @@ deval = df_tot[eval_idx, :]; dtest = df_tot[(end-51630+1):end, :]; config = NeuroTreeRegressor( - device=:gpu, loss=:tweedie_deviance, - actA=:tanh, + actA=:identity, nrounds=200, - outsize=1, depth=4, - ntrees=64, - hidden_size=8, - stack_size=1, - init_scale=1.0, - MLE_tree_split=true, + ntrees=32, batchsize=2048, lr=1e-3, ) -@time m, logger = NeuroTreeModels.fit( +@time m = NeuroTreeModels.fit( config, dtrain; deval, @@ -70,7 +64,7 @@ config = NeuroTreeRegressor( print_every_n=5, early_stopping_rounds=2, metric=:tweedie_deviance, - return_logger=true + device=:gpu ); p_eval = m(deval); diff --git a/benchmarks/Yahoo-LTRC.jl b/benchmarks/Yahoo-LTRC.jl index b9111c2..02e68a8 100644 --- a/benchmarks/Yahoo-LTRC.jl +++ b/benchmarks/Yahoo-LTRC.jl @@ -94,7 +94,6 @@ target_name = "y" # training ##################################### config = NeuroTreeRegressor( - device=:gpu, loss=:logloss, nrounds=400, actA=:identity, @@ -102,7 +101,6 @@ config = 
NeuroTreeRegressor( scaler=true, depth=4, ntrees=256, - outsize=1, hidden_size=1, stack_size=1, batchsize=1024, @@ -110,7 +108,7 @@ config = NeuroTreeRegressor( lr=3e-4, ) -@time m, logger = NeuroTreeModels.fit( +@time m = NeuroTreeModels.fit( config, dtrain; deval, @@ -119,7 +117,7 @@ config = NeuroTreeRegressor( print_every_n=5, early_stopping_rounds=3, metric=:logloss, - return_logger=true + device=:gpu, ); dinfer = NeuroTreeModels.get_df_loader_infer(dtest; feature_names, batchsize=config.batchsize, device=config.device); diff --git a/benchmarks/aicrowd-test.jl b/benchmarks/aicrowd-test.jl index cc2269c..d9155d8 100644 --- a/benchmarks/aicrowd-test.jl +++ b/benchmarks/aicrowd-test.jl @@ -51,15 +51,12 @@ y_train = Vector{Float32}(df_train[:, target]) y_eval = Vector{Float32}(df_eval[:, target]) config = NeuroTreeRegressor( - device = :gpu, loss = :logloss, nrounds = 100, actA = :tanh, scaler = false, - outsize = 1, depth = 4, ntrees = 32, - masks = nothing, batchsize = 4096, rng = 123, opt = Dict("type" => "nadam", "lr" => 3e-2, "rho" => 0.9), @@ -75,6 +72,7 @@ config = NeuroTreeRegressor( early_stopping_rounds = 5, print_every_n = 1, metric = :logloss, + device = :gpu, ); using CUDA diff --git a/src/learners.jl b/src/learners.jl index 4a5a75f..5c95eee 100644 --- a/src/learners.jl +++ b/src/learners.jl @@ -4,29 +4,31 @@ abstract type MAE <: LossType end abstract type LogLoss <: LossType end abstract type MLogLoss <: LossType end abstract type GaussianMLE <: LossType end +abstract type TweedieDeviance <: LossType end const _loss_type_dict = Dict( - :mse => MSE, - :mae => MAE, - :logloss => LogLoss, - :gaussian_mle => GaussianMLE, - :mlogloss => MLogLoss + :mse => MSE, + :mae => MAE, + :logloss => LogLoss, + :tweedie_deviance => TweedieDeviance, + :gaussian_mle => GaussianMLE, + :mlogloss => MLogLoss ) mutable struct NeuroTreeRegressor <: MMI.Deterministic - loss::Symbol - nrounds::Int - lr::Float32 - wd::Float32 - batchsize::Int - actA::Symbol - depth::Int - ntrees::Int - hidden_size::Int - stack_size::Int - init_scale::Float32 - MLE_tree_split::Bool - rng::Any + loss::Symbol + nrounds::Int + lr::Float32 + wd::Float32 + batchsize::Int + actA::Symbol + depth::Int + ntrees::Int + hidden_size::Int + stack_size::Int + init_scale::Float32 + MLE_tree_split::Bool + rng::Any end """ @@ -145,77 +147,77 @@ p = predict(mach, X) """ function NeuroTreeRegressor(; kwargs...) - # defaults arguments - args = Dict{Symbol,Any}( - :loss => :mse, - :nrounds => 10, - :lr => 1.0f-2, - :wd => 0.0f0, - :batchsize => 2048, - :actA => :tanh, - :depth => 4, - :ntrees => 64, - :hidden_size => 1, - :stack_size => 1, - :init_scale => 0.1, - :MLE_tree_split => false, - :rng => 123, - ) - - args_ignored = setdiff(keys(kwargs), keys(args)) - args_ignored_str = join(args_ignored, ", ") - length(args_ignored) > 0 && - @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." - - args_default = setdiff(keys(args), keys(kwargs)) - args_default_str = join(args_default, ", ") - length(args_default) > 0 && - @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)." 
- - args_override = intersect(keys(args), keys(kwargs)) - for arg in args_override - args[arg] = kwargs[arg] - end - - loss = Symbol(args[:loss]) - loss ∉ [:mse, :mae, :logloss, :gaussian_mle] && error("The provided kwarg `loss`: $loss is not supported.") - - args[:rng] = mk_rng(args[:rng]) - - config = NeuroTreeRegressor( - args[:loss], - args[:nrounds], - Float32(args[:lr]), - Float32(args[:wd]), - args[:batchsize], - Symbol(args[:actA]), - args[:depth], - args[:ntrees], - args[:hidden_size], - args[:stack_size], - args[:init_scale], - args[:MLE_tree_split], - args[:rng] - ) - - return config + # defaults arguments + args = Dict{Symbol,Any}( + :loss => :mse, + :nrounds => 10, + :lr => 1.0f-2, + :wd => 0.0f0, + :batchsize => 2048, + :actA => :tanh, + :depth => 4, + :ntrees => 64, + :hidden_size => 1, + :stack_size => 1, + :init_scale => 0.1, + :MLE_tree_split => false, + :rng => 123, + ) + + args_ignored = setdiff(keys(kwargs), keys(args)) + args_ignored_str = join(args_ignored, ", ") + length(args_ignored) > 0 && + @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." + + args_default = setdiff(keys(args), keys(kwargs)) + args_default_str = join(args_default, ", ") + length(args_default) > 0 && + @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)." + + args_override = intersect(keys(args), keys(kwargs)) + for arg in args_override + args[arg] = kwargs[arg] + end + + loss = Symbol(args[:loss]) + loss ∉ [:mse, :mae, :logloss, :gaussian_mle, :tweedie_deviance] && error("The provided kwarg `loss`: $loss is not supported.") + + args[:rng] = mk_rng(args[:rng]) + + config = NeuroTreeRegressor( + args[:loss], + args[:nrounds], + Float32(args[:lr]), + Float32(args[:wd]), + args[:batchsize], + Symbol(args[:actA]), + args[:depth], + args[:ntrees], + args[:hidden_size], + args[:stack_size], + args[:init_scale], + args[:MLE_tree_split], + args[:rng] + ) + + return config end mutable struct NeuroTreeClassifier <: MMI.Probabilistic - loss::Symbol - nrounds::Int - lr::Float32 - wd::Float32 - batchsize::Int - actA::Symbol - depth::Int - ntrees::Int - hidden_size::Int - stack_size::Int - init_scale::Float32 - MLE_tree_split::Bool - rng::Any + loss::Symbol + nrounds::Int + lr::Float32 + wd::Float32 + batchsize::Int + actA::Symbol + depth::Int + ntrees::Int + hidden_size::Int + stack_size::Int + init_scale::Float32 + MLE_tree_split::Bool + rng::Any end """ @@ -328,56 +330,56 @@ p = predict(mach, X) """ function NeuroTreeClassifier(; kwargs...) - # defaults arguments - args = Dict{Symbol,Any}( - :nrounds => 10, - :lr => 1.0f-2, - :wd => 0.0f0, - :batchsize => 2048, - :actA => :tanh, - :depth => 4, - :ntrees => 64, - :hidden_size => 1, - :stack_size => 1, - :init_scale => 0.1, - :MLE_tree_split => false, - :rng => 123, - ) - - args_ignored = setdiff(keys(kwargs), keys(args)) - args_ignored_str = join(args_ignored, ", ") - length(args_ignored) > 0 && - @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." - - args_default = setdiff(keys(args), keys(kwargs)) - args_default_str = join(args_default, ", ") - length(args_default) > 0 && - @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)." 
- - args_override = intersect(keys(args), keys(kwargs)) - for arg in args_override - args[arg] = kwargs[arg] - end - - args[:rng] = mk_rng(args[:rng]) - - config = NeuroTreeClassifier( - :mlogloss, - args[:nrounds], - Float32(args[:lr]), - Float32(args[:wd]), - args[:batchsize], - Symbol(args[:actA]), - args[:depth], - args[:ntrees], - args[:hidden_size], - args[:stack_size], - args[:init_scale], - args[:MLE_tree_split], - args[:rng], - ) - - return config + # defaults arguments + args = Dict{Symbol,Any}( + :nrounds => 10, + :lr => 1.0f-2, + :wd => 0.0f0, + :batchsize => 2048, + :actA => :tanh, + :depth => 4, + :ntrees => 64, + :hidden_size => 1, + :stack_size => 1, + :init_scale => 0.1, + :MLE_tree_split => false, + :rng => 123, + ) + + args_ignored = setdiff(keys(kwargs), keys(args)) + args_ignored_str = join(args_ignored, ", ") + length(args_ignored) > 0 && + @info "Following $(length(args_ignored)) provided arguments will be ignored: $(args_ignored_str)." + + args_default = setdiff(keys(args), keys(kwargs)) + args_default_str = join(args_default, ", ") + length(args_default) > 0 && + @info "Following $(length(args_default)) arguments were not provided and will be set to default: $(args_default_str)." + + args_override = intersect(keys(args), keys(kwargs)) + for arg in args_override + args[arg] = kwargs[arg] + end + + args[:rng] = mk_rng(args[:rng]) + + config = NeuroTreeClassifier( + :mlogloss, + args[:nrounds], + Float32(args[:lr]), + Float32(args[:wd]), + args[:batchsize], + Symbol(args[:actA]), + args[:depth], + args[:ntrees], + args[:hidden_size], + args[:stack_size], + args[:init_scale], + args[:MLE_tree_split], + args[:rng], + ) + + return config end const NeuroTypes = Union{NeuroTreeRegressor,NeuroTreeClassifier} diff --git a/src/loss.jl b/src/loss.jl index ead9dd2..dc4bc6a 100644 --- a/src/loss.jl +++ b/src/loss.jl @@ -56,17 +56,17 @@ end function mlogloss(m, x, y) p = logsoftmax(m(x); dims=1) k = size(p, 1) - mean(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1)) + mean(-sum(onehotbatch(y, 1:k) .* p; dims=1)) end function mlogloss(m, x, y, w) p = logsoftmax(m(x); dims=1) k = size(p, 1) - sum(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1) .* w) / sum(w) + sum(-sum(onehotbatch(y, 1:k) .* p; dims=1) .* w) / sum(w) end function mlogloss(m, x, y, w, offset) p = logsoftmax(m(x) .+ offset; dims=1) k = size(p, 1) - sum(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1) .* w) / sum(w) + sum(-sum(onehotbatch(y, 1:k) .* p; dims=1) .* w) / sum(w) end gaussian_mle_loss(μ::AbstractVector{T}, σ::AbstractVector{T}, y::AbstractVector{T}) where {T} = diff --git a/src/metrics.jl b/src/metrics.jl index d4b7c23..b0bbcd4 100644 --- a/src/metrics.jl +++ b/src/metrics.jl @@ -100,21 +100,21 @@ end function mlogloss(m, x, y; agg=mean) p = logsoftmax(m(x); dims=1) k = size(p, 1) - raw = dropdims(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1); dims=1) + raw = dropdims(-sum(onehotbatch(y, 1:k) .* p; dims=1); dims=1) metric = agg(raw) return metric end function mlogloss(m, x, y, w; agg=mean) p = logsoftmax(m(x); dims=1) k = size(p, 1) - raw = dropdims(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1); dims=1) + raw = dropdims(-sum(onehotbatch(y, 1:k) .* p; dims=1); dims=1) metric = agg(raw .* w) return metric end function mlogloss(m, x, y, w, offset; agg=mean) p = logsoftmax(m(x) .+ offset; dims=1) k = size(p, 1) - raw = dropdims(-sum(onehotbatch(UInt32.(y), 1:k) .* p; dims=1); dims=1) + raw = dropdims(-sum(onehotbatch(y, 1:k) .* p; dims=1); dims=1) metric = agg(raw .* w) return metric end
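
Note on the new loss: the `tweedie` loss functions and the `:tweedie_deviance` metric added in this patch series all evaluate the Tweedie unit deviance with the power parameter hard-coded to `rho = 1.5`, the same expression repeated in `src/loss.jl` and `src/metrics.jl`. A minimal scalar sketch of that expression follows; it is illustrative only, and `tweedie_deviance_unit` is not part of the package API.

```julia
# Tweedie unit deviance for a single observation, with the power parameter
# fixed at rho = 1.5 as in the patched loss and metric code. `y` is the
# observed target and `mu` is the predicted mean on the response scale.
function tweedie_deviance_unit(y::Real, mu::Real; rho::Real=1.5)
    return 2 * (y^(2 - rho) / ((1 - rho) * (2 - rho)) -
                y * mu^(1 - rho) / (1 - rho) +
                mu^(2 - rho) / (2 - rho))
end

tweedie_deviance_unit(2.0, 2.0)  # ≈ 0.0: the deviance vanishes when mu == y
tweedie_deviance_unit(2.0, 1.0)  # ≈ 0.69: positive whenever mu != y
```

With `rho = 1.5` the deviance sits between the Poisson (`rho = 1`) and Gamma (`rho = 2`) cases, which is why it is a common default for non-negative, zero-inflated targets.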