@@ -70,42 +70,42 @@ function compute_gradients_internal(objective_function::F, model, data, ps, st)
     )
 end
 
-Profiler.@annotate "Compile Compute Gradients" function Lux.Training.compute_gradients_impl(
+Profiler.@annotate "Compute Gradients" function Lux.Training.compute_gradients_impl(
     backend::ReactantBackend, objective_function::F, data, ts::Training.TrainState
 ) where {F}
-    compiled_gradient_function = with_default_precision_config(ts.parameters) do
-        @compile sync = backend.sync compute_gradients_internal(
-            objective_function, ts.model, data, ts.parameters, ts.states
-        )
+    if (
+        ts.cache isa TrainingBackendCache &&
+        hasfield(typeof(ts.cache.extras), :compiled_gradient_function)
+    )
+        compiled_gradient_function = ts.cache.extras.compiled_gradient_function
+    else
+        compiled_gradient_function = with_default_precision_config(ts.parameters) do
+            @compile sync = backend.sync compute_gradients_internal(
+                objective_function, ts.model, data, ts.parameters, ts.states
+            )
+        end
+
+        if ts.cache isa TrainingBackendCache
+            @set! ts.cache.extras = merge(ts.cache.extras, (; compiled_gradient_function))
+        else
+            cache = TrainingBackendCache(
+                backend, False(), nothing, (; compiled_gradient_function)
+            )
+            @set! ts.cache = cache
+        end
+        @set! ts.objective_function = objective_function
     end
 
     grads, loss, stats, st = compiled_gradient_function(
         objective_function, ts.model, data, ts.parameters, ts.states
     )
 
-    cache = TrainingBackendCache(backend, False(), nothing, (; compiled_gradient_function))
-    @set! ts.cache = cache
-    @set! ts.objective_function = objective_function
-    @set! ts.states = st
-    return grads, loss, stats, ts
-end
-
-Profiler.@annotate "Compute Gradients" function Lux.Training.compute_gradients_impl(
-    ::ReactantBackend,
-    obj_fn::F,
-    data,
-    ts::Training.TrainState{<:TrainingBackendCache{<:ReactantBackend},F},
-) where {F}
-    grads, loss, stats, st = ts.cache.extras.compiled_gradient_function(
-        obj_fn, ts.model, data, ts.parameters, ts.states
-    )
     @set! ts.states = st
     return grads, loss, stats, ts
 end
 
 for inplace in ("!", "")
     fname = Symbol(:single_train_step_impl, inplace)
-    internal_fn = Symbol(:compute_gradients_internal_and_step, inplace)
     apply_gradients_fn = Symbol(:apply_gradients, inplace)
     update_fn = Symbol(:update, inplace)
 
@@ -141,110 +141,108 @@ for inplace in ("!", "")
     end
 
     # XXX: recompile with a warning if new input types are used
-    @eval Profiler.@annotate "Compile Train Step" function Lux.Training.$(fname)(
+    @eval Profiler.@annotate "Train Step" function Lux.Training.$(fname)(
         backend::ReactantBackend, objective_function::F, data, ts::Training.TrainState
     ) where {F}
-        device = get_device((ts.parameters, ts.states, ts.optimizer_state, data))
-        @assert device isa ReactantDevice
-        is_sharded = device.device === nothing
-
-        dps = if backend.return_gradients isa True
-            Functors.fmap(Utils.zero, ts.parameters; exclude=MLDataDevices.isleaf)
+        if (
+            ts.cache isa TrainingBackendCache &&
+            hasfield(typeof(ts.cache.extras), :compiled_grad_and_step_function)
+        )
+            (; compiled_grad_and_step_function, is_sharded) = ts.cache.extras
+            ps = ts.parameters
+            dparameters = ts.cache.dparameters
         else
-            nothing
-        end
+            device = get_device((ts.parameters, ts.states, ts.optimizer_state, data))
+            @assert device isa ReactantDevice
+            is_sharded = device.device === nothing
+
+            dparameters = if backend.return_gradients isa True
+                Functors.fmap(Utils.zero, ts.parameters; exclude=MLDataDevices.isleaf)
+            else
+                nothing
+            end
 
-        $(ps_expr)
-
-        compiled_grad_and_step_function = with_default_precision_config(ts.parameters) do
-            @compile sync = backend.sync $(internal_fn)(
-                objective_function,
-                ts.model,
-                data,
-                ps,
-                ts.states,
-                ts.optimizer_state,
-                dps,
-                is_sharded,
-            )
+            $(ps_expr)
+
+            compiled_grad_and_step_function =
+                with_default_precision_config(ts.parameters) do
+                    @compile sync = backend.sync compute_gradients_internal_and_step!(
+                        objective_function,
+                        ts.model,
+                        data,
+                        ps,
+                        ts.states,
+                        ts.optimizer_state,
+                        dparameters,
+                        is_sharded,
+                    )
+                end
+
+            if ts.cache isa TrainingBackendCache
+                @set! ts.cache.dparameters = dparameters
+                @set! ts.cache.extras = merge(
+                    ts.cache.extras, (; compiled_grad_and_step_function, is_sharded)
+                )
+            else
+                cache = TrainingBackendCache(
+                    backend,
+                    False(),
+                    dparameters,
+                    (; compiled_grad_and_step_function, is_sharded),
+                )
+                @set! ts.cache = cache
+            end
+            @set! ts.objective_function = objective_function
         end
 
+        @show typeof(dparameters)
+
         grads, ps, loss, stats, st, opt_state = compiled_grad_and_step_function(
             objective_function,
             ts.model,
             data,
             ps,
             ts.states,
             ts.optimizer_state,
-            dps,
+            dparameters,
             is_sharded,
         )
 
-        cache = TrainingBackendCache(
-            backend, False(), dps, (; compiled_grad_and_step_function, is_sharded)
-        )
-        @set! ts.cache = cache
-        @set! ts.objective_function = objective_function
-        @set! ts.states = st
-        @set! ts.parameters = ps
-        @set! ts.optimizer_state = opt_state
-        @set! ts.step = ts.step + 1
-
-        return grads, loss, stats, ts
-    end
-
-    @eval Profiler.@annotate "Train Step" function Lux.Training.$(fname)(
-        ::ReactantBackend,
-        obj_fn::F,
-        data,
-        ts::Training.TrainState{<:TrainingBackendCache{<:ReactantBackend},F},
-    ) where {F}
-        grads, ps, loss, stats, st, opt_state = ts.cache.extras.compiled_grad_and_step_function(
-            obj_fn,
-            ts.model,
-            data,
-            ts.parameters,
-            ts.states,
-            ts.optimizer_state,
-            ts.cache.dparameters,
-            ts.cache.extras.is_sharded,
-        )
-
         @set! ts.states = st
         @set! ts.parameters = ps
         @set! ts.optimizer_state = opt_state
         @set! ts.step = ts.step + 1
 
         return grads, loss, stats, ts
     end
+end
 
-    @eval function $(internal_fn)(
-        objective_function::F, model, data, ps, st, opt_state, ::Nothing, is_sharded::Bool
-    ) where {F}
-        dps, loss, stats, stₙ = compute_gradients_internal(
-            objective_function, model, data, ps, st
-        )
+@eval function compute_gradients_internal_and_step!(
+    objective_function::F, model, data, ps, st, opt_state, ::Nothing, is_sharded::Bool
+) where {F}
+    dps, loss, stats, stₙ = compute_gradients_internal(
+        objective_function, model, data, ps, st
+    )
 
-        opt_state, psₙ = Optimisers.update!(opt_state, ps, dps)
-        # Ensure sharding of input and output states are consistent
-        is_sharded && mark_same_sharding_group(st, stₙ)
+    opt_state, psₙ = Optimisers.update!(opt_state, ps, dps)
+    # Ensure sharding of input and output states are consistent
+    is_sharded && mark_same_sharding_group(st, stₙ)
 
-        return nothing, psₙ, loss, stats, stₙ, opt_state
-    end
+    return nothing, psₙ, loss, stats, stₙ, opt_state
+end
 
-    @eval function $(internal_fn)(
-        objective_function::F, model, data, ps, st, opt_state, dps, is_sharded::Bool
-    ) where {F}
-        dps, loss, stats, stₙ = compute_gradients_internal!(
-            dps, objective_function, model, data, ps, st
-        )
+@eval function compute_gradients_internal_and_step!(
+    objective_function::F, model, data, ps, st, opt_state, dps, is_sharded::Bool
+) where {F}
+    dps, loss, stats, stₙ = compute_gradients_internal!(
+        dps, objective_function, model, data, ps, st
+    )
 
-        opt_state, psₙ = Optimisers.update!(opt_state, ps, dps)
-        # Ensure sharding of input and output states are consistent
-        is_sharded && mark_same_sharding_group(st, stₙ)
+    opt_state, psₙ = Optimisers.update!(opt_state, ps, dps)
+    # Ensure sharding of input and output states are consistent
+    is_sharded && mark_same_sharding_group(st, stₙ)
 
-        return dps, psₙ, loss, stats, stₙ, opt_state
-    end
+    return dps, psₙ, loss, stats, stₙ, opt_state
 end
 
 mark_same_sharding_group(args...) = Functors.fmap(mark_same_sharding_group_inner, args...)
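The core change in this diff is that the compiled Reactant function now lives inside the `TrainState` cache: the first call compiles it and stashes it in `cache.extras`, and later calls detect it via `hasfield(typeof(ts.cache.extras), ...)` and reuse it, instead of dispatching to a second, cache-specialized method. The standalone sketch below illustrates that compile-once/reuse pattern in plain Julia. It is illustrative only: `SimpleTrainState`, `expensive_compile`, and `cached_step!` are hypothetical names, and the mutable struct stands in for the immutable `@set!`-updated `TrainState` used in the real code.

```julia
# Minimal sketch of the "check the cache, compile on miss, reuse on hit" pattern.
# All names are hypothetical stand-ins, not Lux or Reactant APIs.

mutable struct SimpleTrainState{P}
    parameters::P
    cache::NamedTuple   # plays the role of TrainState's backend cache extras
end

# Stand-in for the expensive `@compile` step: returns a callable closure.
function expensive_compile(f, ps)
    println("compiling...")   # runs only on a cache miss
    return data -> f(ps, data)
end

function cached_step!(f, ts::SimpleTrainState, data)
    # Reuse the compiled function if the cache already holds one; this mirrors
    # the `hasfield(typeof(ts.cache.extras), :compiled_gradient_function)` check.
    compiled = if haskey(ts.cache, :compiled_fn)
        ts.cache.compiled_fn
    else
        fn = expensive_compile(f, ts.parameters)
        ts.cache = merge(ts.cache, (; compiled_fn=fn))  # stash for later calls
        fn
    end
    return compiled(data)
end

loss(ps, x) = sum(abs2, ps .* x)

ts = SimpleTrainState([1.0, 2.0], NamedTuple())
cached_step!(loss, ts, [3.0, 4.0])  # prints "compiling..." and returns 73.0
cached_step!(loss, ts, [3.0, 4.0])  # cache hit: no recompilation
```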