2 changes: 1 addition & 1 deletion .gitignore
Expand Up @@ -6,7 +6,7 @@ docs/build/
docs/site/
deps
.vscode
Manifest.toml
Manifest*.toml
LocalPreferences.toml
.DS_Store
docs/mymodel.bson
Expand Down
66 changes: 63 additions & 3 deletions NEWS.md
Expand Up @@ -2,11 +2,71 @@

See also [github's page](https://github.com/FluxML/Flux.jl/releases) for a complete list of PRs merged before each release.

## v0.16.1 (25 December 2025)
## v0.16.8 (January 2026)

The default `init_score` value for `early_stopping` has been set to `Inf` (instead of `0`) to prevent unexpected behavior when the defaults are left unchanged. Documentation has been updated to explain that, if the user needs to track a metric where improvement is shown by increasing values, then `init_score` needs to be adjusted accordingly, for example to `-Inf`. Tests for `early_stopping` have been reorganized and extended to better illustrate its behavior.
This release includes the following changes:
- Added support in `Flux.gradient` and `Flux.withgradient` for alternative AD backends such as `AutoEnzyme()` and `AutoMooncake()`.
- The default `init_score` value for `early_stopping` has been set to `Inf` (instead of `0`) to prevent unexpected behavior when the defaults are left unchanged.

## v0.16.0 (15 December 2025)
## v0.16.7 (10 December 2025)

This patch release includes:

* Minor documentation fixes and housekeeping commits.
* Compatibility updates for downstream packages.


## v0.16.6 (8 December 2025)

This patch release includes:

* Minor dependency bumps and CI updates.
* Preparatory changes ahead of v0.16.7.

## v0.16.5 (23 July 2025)

This release includes:

* Fix typos in legacy tutorials documentation. ([GitHub][2])
* Bump compatibility for `AMDGPU` in weak dependencies. ([GitHub][2])
* **Fix** for `unsafe_free!` failure with certain `CuArray` configurations. ([GitHub][2])

## v0.16.4 (2 June 2025)

This release includes:

* Fix missing imports in `FluxMPIExt`. ([GitHub][1])
* Add shape validation for convolution weight tensors. ([GitHub][1])
* Disable and fix intermittent Reactant tests. ([GitHub][1])
* Fix recurrent docstrings and pooling layer loading. ([GitHub][1])
* Small test updates and miscellaneous doc fixes. ([GitHub][1])

## v0.16.3 (6 February 2025)

This release includes:

* **Fix** for `cpu(dataloader)` behavior. ([GitHub][1])
* Addressed data loading and preprocessing pipeline issues. ([GitHub][1])
* Resolved “Infinite time of gradient” edge case. ([GitHub][1])

## v0.16.2 (21 January 2025)

This release includes:

* Updated dependencies and bumped to v0.16.1 as a base. ([GitHub][1])
* **Fixes** around new gradients, precompilation on Julia 1.12, and export issues. ([GitHub][1])

## v0.16.1 (13 January 2025)

This release includes:

* Added references to recurrent layers in `ecosystem.md`. ([GitHub][1])
* Fixed typo in recurrence documentation. ([GitHub][1])
* Added “return state” option to recurrent layers. ([GitHub][1])
* Updated schedulers docs, collapsed docstrings in layers docs. ([GitHub][1])
* Test fixes for Enzyme and Reactant forward/reverse passes. ([GitHub][1])

## v0.16.0 (15 December 2024)
This release has a single **breaking change**:

- The recurrent cells `RNNCell`, `LSTMCell`, and `GRUCell` forward has been changed to
Expand Down
14 changes: 11 additions & 3 deletions Project.toml
Expand Up @@ -2,10 +2,8 @@ name = "Flux"
uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
version = "0.16.7"

[workspace]
projects = ["test", "docs"]

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
Expand Down Expand Up @@ -33,7 +31,9 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

Expand All @@ -42,23 +42,28 @@ FluxAMDGPUExt = "AMDGPU"
FluxCUDAExt = "CUDA"
FluxCUDAcuDNNExt = ["CUDA", "cuDNN"]
FluxEnzymeExt = "Enzyme"
FluxFiniteDifferencesExt = "FiniteDifferences"
FluxMPIExt = "MPI"
FluxMPINCCLExt = ["CUDA", "MPI", "NCCL"]
FluxMooncakeExt = "Mooncake"

[compat]
ADTypes = "1"
AMDGPU = "1, 2"
Adapt = "4"
CUDA = "5"
ChainRulesCore = "1.12"
Compat = "4.10.0"
Enzyme = "0.13"
EnzymeCore = "0.7.7, 0.8.4"
FiniteDifferences = "0.12"
Functors = "0.5"
MLCore = "1.0.0"
MLDataDevices = "1.4.2"
MLUtils = "0.4"
MPI = "0.20.19"
MacroTools = "0.5"
Mooncake = "0.4"
NCCL = "0.1.1"
NNlib = "0.9.22"
OneHotArrays = "0.2.4"
Expand All @@ -72,3 +77,6 @@ Statistics = "1"
Zygote = "0.6.67, 0.7"
cuDNN = "1"
julia = "1.10"

[workspace]
projects = ["test", "docs"]
4 changes: 4 additions & 0 deletions docs/Project.toml
Expand Up @@ -10,12 +10,16 @@ MLCore = "c2834f40-e789-41da-a90e-33b280584a8c"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[sources]
Flux = {path = ".."}

[compat]
Documenter = "1.3"
11 changes: 6 additions & 5 deletions docs/make.jl
@@ -1,6 +1,7 @@
using Documenter, Flux, NNlib, Functors, MLUtils, BSON, Optimisers,
using Documenter
using Flux, NNlib, Functors, MLUtils, BSON, Optimisers,
OneHotArrays, Zygote, ChainRulesCore, Plots, MLDatasets, Statistics,
DataFrames, JLD2, MLDataDevices, MLCore
DataFrames, JLD2, MLDataDevices, MLCore, Mooncake
using MLCore: numobs, getobs, getobs!

ENV["DATADEPS_ALWAYS_ACCEPT"] = true
Expand All @@ -21,7 +22,8 @@ makedocs(
sidebar_sitename = false,
analytics = "UA-36890222-9",
assets = ["assets/flux.css"],
prettyurls = get(ENV, "CI", nothing) == "true"
prettyurls = get(ENV, "CI", nothing) == "true",
size_threshold=1_000_000,
),
pages = [
"Welcome" => "index.md",
Expand Down Expand Up @@ -50,8 +52,7 @@ makedocs(
"Shape Inference" => "reference/outputsize.md",
"Flat vs. Nested" => "reference/destructure.md",
"Callback Helpers" => "reference/training/callbacks.md",
"Gradients -- Zygote.jl" => "reference/training/zygote.md",
"Gradients -- Enzyme.jl" => "reference/training/enzyme.md",
"Gradients" => "reference/training/gradients.md",
"Transfer Data to GPU -- MLDataDevices.jl" => "reference/data/mldatadevices.md",
"Batching Data -- MLUtils.jl" => "reference/data/mlutils.md",
"OneHotArrays.jl" => "reference/data/onehot.md",
Expand Down
28 changes: 23 additions & 5 deletions docs/src/guide/models/basics.md
Expand Up @@ -181,9 +181,7 @@ These matching nested structures are at the core of how Flux works.
This method of `gradient` takes a zero-argument function, which only *implicitly*
depends on `θ`.

```@raw html
<h3><img src="../../../assets/zygote-crop.png" width="40px"/>&nbsp;<a href="https://github.com/FluxML/Zygote.jl">Zygote.jl</a></h3>
```
## Automatic Differentiation

Flux's [`gradient`](@ref Flux.gradient) function by default uses a companion package called [Zygote](https://github.com/FluxML/Zygote.jl).
Zygote performs source-to-source automatic differentiation, meaning that `gradient(f, x)`
Expand All @@ -198,7 +196,7 @@ Flux can also be used with other automatic differentiation (AD) packages.
It was originally written using [Tracker](https://github.com/FluxML/Tracker.jl), a more traditional operator-overloading approach.
The future might be [Enzyme](https://github.com/EnzymeAD/Enzyme.jl), and Flux now builds in an easy way to use this instead, turned on by wrapping the model in `Duplicated`. (For details, see the [Enzyme page](@ref autodiff-enzyme) in the manual.)

```julia
```julia-repl
julia> using Enzyme: Const, Duplicated

julia> grad3e = Flux.gradient((x,p) -> p(x), Const(5.0), Duplicated(poly3s))
Expand All @@ -210,13 +208,33 @@ Here, this is because `Const(5.0)` is explicitly constant.
Below, we will see an example where `nothing` shows up because the model struct has fields containing things other than parameters, such as an activation function.
(It also adopts the convention that `gradient(f, x, y)` returns a tuple `(∂f/∂x, ∂f/∂y)`, without a "`∂f/∂f`" term for the function. This is why we had to write `gradient(|>, 5, poly4)` above, not just `gradient(poly4, 5)`.)

Finally, the function [`withgradient`](@ref) works the same way, but also returns the value of the function:
The function [`withgradient`](@ref) works the same way, but also returns the value of the function:

```jldoctest poly
julia> Flux.withgradient((x,p) -> p(x), 5.0, poly3s)
(val = 17.5, grad = (2.0, (θ3 = [1.0, 5.0, 25.0],)))
```

One can also directly specify which AD backend to use by passing an adtype among the supported ones
(`AutoMooncake`, `AutoEnzyme`, `AutoZygote`, `AutoFiniteDifferences`) as the second argument.
The corresponding AD package has to be loaded first.

Here is an example using [Mooncake](https://github.com/chalk-lab/Mooncake.jl):
```jldoctest poly
julia> using Mooncake

julia> Flux.withgradient((x,p) -> p(x), AutoMooncake(), 5.0, poly3s)
(val = 17.5, grad = (2.0, Poly3{Vector{Float64}}([1.0, 5.0, 25.0])))
```

and here is the same example using Enzyme:
```julia-repl
julia> using Enzyme

julia> Flux.withgradient((x,p) -> p(x), AutoEnzyme(), 5.0, poly3s)
(val = 17.5, grad = (2.0, Poly3{Vector{Float64}}([1.0, 5.0, 25.0])))
```

## Simple Neural Networks

The polynomial functions above send a number `x` to another a number `y`.
Expand Down
1 change: 1 addition & 0 deletions docs/src/reference/data/mlutils.md
Expand Up @@ -25,6 +25,7 @@ these functions help create inputs for your models or batch your dataset.
MLUtils.batch
MLUtils.batchsize
MLUtils.batchseq
MLUtils.batch_sequence
MLUtils.BatchView
MLUtils.chunk
MLUtils.eachobs
Expand Down
@@ -1,5 +1,64 @@
```@meta
CollapsedDocStrings = true
```

# Automatic Differentiation in Flux

Flux's `gradient` function uses [Zygote](https://github.com/FluxML/Zygote.jl) by default, and this same function is used within [`train!`](@ref Flux.train!) to differentiate the model.
Zygote has its own [documentation](https://fluxml.ai/Zygote.jl/dev/), in particular listing some [important limitations](https://fluxml.ai/Zygote.jl/dev/limitations/).

# [Automatic Differentiation using Enzyme.jl](@id autodiff-enzyme)
Flux also has support for Enzyme.jl (documented [below](@ref autodiff-enzyme)) and for Mooncake.jl.


## Generic Gradient Interface

```@docs
Flux.gradient(f, adtype::AbstractADType, args::Any...)
Flux.withgradient(f, adtype::AbstractADType, args::Any...)
```
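
For instance, here is a minimal sketch of calling this interface with an explicitly chosen backend. The model and data are made up for illustration, and `AutoZygote` is taken here from ADTypes.jl, which defines all of the `Auto*` backend types:

```julia
using Flux, ADTypes

model = Dense(3 => 2)
x = rand(Float32, 3)
loss(m) = sum(abs2, m(x))

Flux.gradient(loss, AutoZygote(), model)      # same backend as the default
Flux.withgradient(loss, AutoZygote(), model)  # also returns the value of `loss`
```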

## [Automatic Differentiation using Zygote.jl](@id autodiff-zygote)

The default AD backend in Flux is Zygote. Besides gradient calculation, Zygote also supports
higher-order derivatives, Jacobians, Hessians, and pullbacks.
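
For example, a small illustrative sketch (not taken from the package itself):

```julia-repl
julia> using Zygote

julia> Zygote.jacobian(x -> x .^ 2, [1.0, 2.0, 3.0])[1]
3×3 Matrix{Float64}:
 2.0  0.0  0.0
 0.0  4.0  0.0
 0.0  0.0  6.0

julia> Zygote.hessian(x -> sum(x .^ 3), [1.0, 2.0])
2×2 Matrix{Float64}:
 6.0   0.0
 0.0  12.0
```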

```@docs
Zygote.jacobian(f, args...)
Zygote.withjacobian(f, args...)
Zygote.hessian
Zygote.hessian_reverse
Zygote.diaghessian
Zygote.pullback
```

## ChainRules for Zygote

Zygote uses [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl) to define how to differentiate functions.

Sometimes it is necessary to exclude some code, or a whole function, from automatic differentiation.
This can be done using the following methods:

```@docs
ChainRulesCore.ignore_derivatives
ChainRulesCore.@non_differentiable
```
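
As a small sketch of how this can look in practice (the book-keeping counter here is made up for illustration):

```julia
using Zygote, ChainRulesCore

const ncalls = Ref(0)   # non-differentiable book-keeping

function loss(x)
    ChainRulesCore.ignore_derivatives() do
        ncalls[] += 1   # this mutation is invisible to the AD pass
    end
    return sum(abs2, x)
end

Zygote.gradient(loss, [1.0, 2.0])   # ([2.0, 4.0],)
```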

To manually supply the gradient for one function, you should define a method of `rrule`. ChainRules has [detailed documentation](https://juliadiff.org/ChainRulesCore.jl/stable/) on how this works.

```@docs
ChainRulesCore.rrule
ChainRulesCore.frule
ChainRulesCore.@scalar_rule
ChainRulesCore.NoTangent
ChainRulesCore.ZeroTangent
ChainRulesCore.RuleConfig
ChainRulesCore.Tangent
ChainRulesCore.canonicalize
```
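
For instance, a minimal hand-written rule for a toy function might look like this (a sketch only; `mysquare` is not part of Flux or ChainRules):

```julia
using ChainRulesCore

mysquare(x::Real) = x^2

function ChainRulesCore.rrule(::typeof(mysquare), x::Real)
    y = mysquare(x)
    mysquare_pullback(ȳ) = (NoTangent(), 2x * ȳ)   # (tangent for the function, ∂y/∂x ⋅ ȳ)
    return y, mysquare_pullback
end

# Zygote will now use this rule: Zygote.gradient(mysquare, 3.0) == (6.0,)
```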

Gradient customization for other AD packages such as Enzyme and Mooncake has to be done according to their own documentation.

## [Automatic Differentiation using Enzyme.jl](@id autodiff-enzyme)

[Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) is a new package for automatic differentiation.
Like Zygote.jl, calling `gradient(f, x)` causes it to hook into the compiler and transform the code that is executed while calculating `f(x)`, in order to produce code for `∂f/∂x`.
Expand Down Expand Up @@ -71,25 +130,16 @@ true
Note that what `Enzyme.gradient` returns is an object like `deepcopy(model)` of the same type, `grads_e[1] isa Chain`.
But its fields contain the same gradient.

There is also a method of `train!` which similarly takes `Duplicated(model)`:

```julia-repl
julia> opt_state = Flux.setup(Adam(0), model);

julia> Flux.train!((m,x,y) -> sum(abs2, m(x) .- y), dup_model, [(x1, y1)], opt_state)
```

## Second-order AD

If you calculate a gradient within the loss function, then training will involve 2nd derivatives.
While this is in principle supported by Zygote.jl, there are many bugs, and Enzyme.jl is probably a better choice.

## Listing

```@docs
Flux.gradient(f, args::Union{Flux.EnzymeCore.Const, Flux.EnzymeCore.Duplicated}...)
Flux.withgradient(f, args::Union{Flux.EnzymeCore.Const, Flux.EnzymeCore.Duplicated}...)
Flux.train!(loss, model::Flux.EnzymeCore.Duplicated, data, opt)
```

Enzyme.jl has [its own extensive documentation](https://enzymead.github.io/Enzyme.jl/stable/).


## Second-order AD

If you calculate a gradient within the loss function, then training will involve 2nd derivatives.
While this is in principle supported by Zygote.jl, there are many bugs, and Enzyme.jl is probably a better choice.
13 changes: 13 additions & 0 deletions docs/src/reference/training/reference.md
Expand Up @@ -28,6 +28,19 @@ Optimisers.setup
To see one in a terminal, you will need to install [TerminalLoggers.jl](https://github.com/JuliaLogging/TerminalLoggers.jl)
and follow its setup instructions.


There is also a method of `train!` which takes `Duplicated(model)` and uses Enzyme.jl for differentiation (see the [Enzyme section](@ref autodiff-enzyme)):
```julia-repl
julia> opt_state = Flux.setup(Adam(0), model);

julia> Flux.train!((m,x,y) -> sum(abs2, m(x) .- y), dup_model, [(x1, y1)], opt_state)
```

```@docs
Flux.train!(loss, model::Flux.EnzymeCore.Duplicated, data, opt)
```


## Optimisation Modifiers

The state returned by `setup` can be modified to temporarily prevent training of
Expand Down