Commit 654bd34

feat: use a caching allocator for GPUArrays workflows (#1549)
* feat: use a caching allocator for GPUArrays workflows
* fix: switch arg position
* fix: GPUArrays compat
* fix: other device types
1 parent 510f710 commit 654bd34
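In user code the cache is picked up transparently whenever the `TrainState` lives on a GPUArrays-backed device. A minimal sketch of the intended workflow, assuming Zygote for AD and CUDA.jl as the (illustrative) GPU backend; the model, batch sizes, and learning rate below are made up:

using Lux, Optimisers, Random, Zygote
using CUDA  # illustrative: any backend exposing a GPUArrays device works

model = Dense(4 => 2)
ps, st = Lux.setup(Random.default_rng(), model)
ts = Training.TrainState(model, ps, st, Adam(0.01f0))

gdev = gpu_device()  # an AbstractGPUDevice when a GPU backend is functional
ts = gdev(ts)        # the TrainState now carries an allocator cache (see the diff below)

x, y = gdev(rand(Float32, 4, 8)), gdev(rand(Float32, 2, 8))
# Temporaries allocated during the step are cached and reused on subsequent steps.
_, loss, _, ts = Training.single_train_step!(AutoZygote(), MSELoss(), (x, y), ts)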

File tree: 3 files changed (+123, -31 lines)
Project.toml

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,7 @@ ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"

@@ -64,6 +65,7 @@ WeightInitializers = {path = "lib/WeightInitializers"}
 LuxComponentArraysExt = "ComponentArrays"
 LuxEnzymeExt = "Enzyme"
 LuxFluxExt = "Flux"
+LuxGPUArraysExt = "GPUArrays"
 LuxLossFunctionsExt = "LossFunctions"
 LuxMLUtilsExt = "MLUtils"
 LuxMPIExt = "MPI"

@@ -93,6 +95,7 @@ Flux = "0.16.3"
 ForwardDiff = "0.10.36, =1"
 FunctionWrappers = "1.1.3"
 Functors = "0.5"
+GPUArrays = "11"
 GPUArraysCore = "0.2"
 LinearAlgebra = "1.10"
 LossFunctions = "0.11.1, 1"
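Because GPUArrays is declared as a weak dependency, `LuxGPUArraysExt` only loads once GPUArrays itself is in the session, which normally happens indirectly through a GPU backend package. A quick check, sketched here with CUDA.jl as an illustrative trigger:

using Lux
using CUDA  # assumption: any package that pulls in GPUArrays triggers the extension

ext = Base.get_extension(Lux, :LuxGPUArraysExt)
ext === nothing && @warn "LuxGPUArraysExt did not load; is GPUArrays available?"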

ext/LuxGPUArraysExt.jl

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+module LuxGPUArraysExt
+
+using GPUArrays: AllocCache, @cached
+using Lux: Training
+using MLDataDevices: AbstractGPUDevice
+
+Training.get_allocator_cache(::AbstractGPUDevice) = AllocCache()
+
+function Training.compute_gradients_impl_with_allocator_cache(
+    backend, alloc_cache::AllocCache, obj_fn::F, data, ts::Training.TrainState
+) where {F}
+    @cached alloc_cache begin
+        return Training.compute_gradients_impl(backend, obj_fn, data, ts)
+    end
+end
+
+for inplace in ("!", "")
+    step_with_alloc_cache = Symbol(:single_train_step_impl_with_allocator_cache, inplace)
+    step_inner = Symbol(:single_train_step_impl, inplace)
+    apply_gradients_with_alloc_cache = Symbol(
+        :apply_gradients_with_allocator_cache, inplace
+    )
+    apply_fn = Symbol(:apply_gradients_impl, inplace)
+
+    @eval begin
+        function Training.$(apply_gradients_with_alloc_cache)(
+            alloc_cache::AllocCache, ts::Training.TrainState, grads
+        )
+            @cached alloc_cache begin
+                return Training.$(apply_fn)(ts, grads)
+            end
+        end
+
+        function Training.$(step_with_alloc_cache)(
+            backend, alloc_cache::AllocCache, obj_fn::F, data, ts::Training.TrainState
+        ) where {F}
+            @cached alloc_cache begin
+                return Training.$(step_inner)(backend, obj_fn, data, ts)
+            end
+        end
+    end
+end
+
+end
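The extension is a thin wrapper over GPUArrays' caching allocator. For reference, a standalone sketch of that underlying API as I understand it from the GPUArrays v11 caching-allocator docs, using CUDA.jl arrays purely for illustration:

using GPUArrays: GPUArrays, AllocCache, @cached
using CUDA  # illustrative; any GPUArrays-compatible array type works

cache = AllocCache()
for _ in 1:100
    @cached cache begin
        # Buffers allocated inside the block are retained in `cache` on exit,
        # so later iterations reuse them instead of hitting the GPU allocator.
        tmp = CUDA.rand(Float32, 1024, 1024)
        sum(tmp * tmp)
    end
end
GPUArrays.unsafe_free!(cache)  # release the cached buffers once the loop is done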

src/helpers/training.jl

Lines changed: 76 additions & 31 deletions
@@ -31,6 +31,7 @@ Training State containing:
 Internal fields:

 - `cache`: Cached values. Implementations are free to use this for whatever they want.
+- `allocator_cache`: Used by GPUArrays compatible backends to cache memory allocations.
 - `objective_function`: Objective function might be cached.

 !!! warning
@@ -41,6 +42,7 @@ Internal fields:
 @concrete struct TrainState
     cache
     objective_function
+    allocator_cache
     model
     parameters
     states
@@ -55,6 +57,7 @@ function Adapt.adapt_structure(to::AbstractDevice, ts::TrainState)
     return TrainState(
         nothing,
         nothing,
+        get_allocator_cache(to),
         ts.model,
         to(ts.parameters),
         to(ts.states),
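The next hunk replaces the hand-written `ReactantDevice` adapt rule with an `@invoke` of the generic method above, so the new `allocator_cache` argument only has to be handled in one place. For readers unfamiliar with `@invoke`, a tiny standalone illustration (plain Julia, nothing Lux-specific):

f(x::Number) = "number method"
f(x::Integer) = "integer method"

f(1)                  # "integer method": normal dispatch picks the most specific method
@invoke f(1::Number)  # "number method": forces the ::Number method, which is how the
                      # ReactantDevice rule below reuses the AbstractDevice rule above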
@@ -91,16 +94,7 @@ function Adapt.adapt_structure(to::ReactantDevice, ts::TrainState)
     This ensures the optimizer state and other internal states are on the device on
     construction.
     """
-    return TrainState(
-        nothing,
-        nothing,
-        ts.model,
-        to(ts.parameters),
-        to(ts.states),
-        ts.optimizer,
-        to(ts.optimizer_state),
-        ts.step,
-    )
+    return @invoke Adapt.adapt_structure(to::AbstractDevice, ts::TrainState)
 end

 """
@@ -125,9 +119,13 @@ function TrainState(model::AbstractLuxLayer, ps, st, optimizer::Optimisers.Abstr
         optimizer = ReactantCompatibleOptimisers.make_reactant_compatible(optimizer, dev)
     end
     st_opt = Optimisers.setup(optimizer, ps)
-    return TrainState(nothing, nothing, model, ps, st, optimizer, st_opt, 0)
+    return TrainState(
+        nothing, nothing, get_allocator_cache(dev), model, ps, st, optimizer, st_opt, 0
+    )
 end

+get_allocator_cache(_) = nothing
+
 @concrete struct TrainingBackendCache
     backend
     first_try <: StaticBool
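`get_allocator_cache` thus defaults to `nothing`; only the GPUArrays extension overrides it for `AbstractGPUDevice`. A hedged REPL-style sketch of the resulting behaviour (the GPU line is an expectation, not something verifiable without a working GPU backend):

using Lux

Lux.Training.get_allocator_cache(cpu_device()) === nothing  # true: generic fallback above

# With a GPU backend loaded (so LuxGPUArraysExt is active), the expectation is:
# Lux.Training.get_allocator_cache(gpu_device()) isa GPUArrays.AllocCache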
@@ -190,14 +188,25 @@ function apply_gradients(ts::TrainState, grads)
     )
         return apply_gradients_reactant(ts, grads)
     end
+    return apply_gradients_with_allocator_cache(ts.allocator_cache, ts, grads)
+end
+
+# apply_gradients -> apply_gradients_reactant (for ReactantBackend)
+#                 -> apply_gradients_with_allocator_cache -> apply_gradients_impl
+
+function apply_gradients_with_allocator_cache(::Nothing, ts::TrainState, grads)
+    return apply_gradients_impl(ts, grads)
+end
+
+function apply_gradients_impl(ts::TrainState, grads)
     optimizer_state, ps = Optimisers.update(ts.optimizer_state, ts.parameters, grads)
     @set! ts.parameters = ps
     @set! ts.optimizer_state = optimizer_state
     @set! ts.step = ts.step + 1
     return ts
 end

-function apply_gradients_reactant end
+function apply_gradients_reactant end # updated in ReactantExt

 """
     apply_gradients!(ts::TrainState, grads)
@@ -214,12 +223,23 @@ function apply_gradients!(ts::TrainState, grads)
     )
         return apply_gradients_reactant!(ts, grads)
     end
+    return apply_gradients_with_allocator_cache!(ts.allocator_cache, ts, grads)
+end
+
+# apply_gradients! -> apply_gradients_reactant! (for ReactantBackend)
+#                  -> apply_gradients_with_allocator_cache! -> apply_gradients_impl!
+
+function apply_gradients_with_allocator_cache!(::Nothing, ts::TrainState, grads)
+    return apply_gradients_impl!(ts, grads)
+end
+
+function apply_gradients_impl!(ts::TrainState, grads)
     Optimisers.update!(ts.optimizer_state, ts.parameters, grads)
     @set! ts.step = ts.step + 1
     return ts
 end

-function apply_gradients_reactant! end
+function apply_gradients_reactant! end # updated in ReactantExt

 const SYNC_DOCSTRING = """
     - `sync`: If `true`, then the compiled reactant function is compiled with `sync=true`.
@@ -288,20 +308,17 @@ A 4-Tuple containing:
 """
 function compute_gradients(ad, obj_fn::F, data, ts::TrainState; sync::Bool=false) where {F}
     dev_type = get_device_type((ts.parameters, ts.states))
-    return compute_gradients_impl(maybe_wrap_adtype(ad, dev_type; sync), obj_fn, data, ts)
+    return compute_gradients_impl_with_allocator_cache(
+        maybe_wrap_adtype(ad, dev_type; sync), ts.allocator_cache, obj_fn, data, ts
+    )
 end

-maybe_wrap_adtype(backend::ReactantBackend, ::Any; kwargs...) = backend
-maybe_wrap_adtype(ad::AbstractADType, ::Any; kwargs...) = ad
-function maybe_wrap_adtype(
-    ad::AbstractADType,
-    ::Type{ReactantDevice};
-    return_gradients::Utils.BoolType=True(),
-    sync::Bool=false,
-)
-    ad isa AutoEnzyme && return ReactantBackend(static(return_gradients), sync)
-    throw(ArgumentError("Computing gradients for models on XLA is supported only with \
-                         Enzyme.jl (`AutoEnzyme`)."))
+# compute_gradients -> compute_gradients_impl_with_allocator_cache -> compute_gradients_impl
+
+function compute_gradients_impl_with_allocator_cache(
+    backend, ::Nothing, obj_fn::F, data, ts::TrainState
+) where {F}
+    return compute_gradients_impl(backend, obj_fn, data, ts)
 end

 function compute_gradients_impl(ad, ::F, _, ts::TrainState) where {F}
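The public `compute_gradients` signature is unchanged; it now merely threads `ts.allocator_cache` through to the implementation. Continuing the hypothetical setup sketched under the commit message above (model, `ts`, `x`, `y` as defined there):

grads, loss, stats, ts = Training.compute_gradients(AutoZygote(), MSELoss(), (x, y), ts)
ts = Training.apply_gradients!(ts, grads)  # in-place optimizer update; ts.step is bumped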
@@ -328,6 +345,19 @@ for package in (:Zygote, :Tracker, :ReverseDiff, :Enzyme, :Mooncake)
     end
 end

+maybe_wrap_adtype(backend::ReactantBackend, ::Any; kwargs...) = backend
+maybe_wrap_adtype(ad::AbstractADType, ::Any; kwargs...) = ad
+function maybe_wrap_adtype(
+    ad::AbstractADType,
+    ::Type{ReactantDevice};
+    return_gradients::Utils.BoolType=True(),
+    sync::Bool=false,
+)
+    ad isa AutoEnzyme && return ReactantBackend(static(return_gradients), sync)
+    throw(ArgumentError("Computing gradients for models on XLA is supported only with \
+                         Enzyme.jl (`AutoEnzyme`)."))
+end
+
 function generate_wrappers(::F, m, ps, st, data, ::False) where {F}
     @warn "Detected function wrapper generation with function being updated between calls. \
            This will generate type-unstable code. A possible reason for this is \
@@ -395,7 +425,9 @@ function single_train_step!(
     backend = maybe_wrap_adtype(
         backend, get_device_type((ts.parameters, ts.states)); return_gradients, sync
     )
-    return single_train_step_impl!(backend, obj_fn, data, ts)
+    return single_train_step_impl_with_allocator_cache!(
+        backend, ts.allocator_cache, obj_fn, data, ts
+    )
 end

 """
@@ -429,16 +461,29 @@ function single_train_step(
     backend = maybe_wrap_adtype(
         backend, get_device_type((ts.parameters, ts.states)); return_gradients, sync
     )
-    return single_train_step_impl(backend, obj_fn, data, ts)
+    return single_train_step_impl_with_allocator_cache(
+        backend, ts.allocator_cache, obj_fn, data, ts
+    )
 end

+# single_train_step -> single_train_step_impl_with_allocator_cache -> single_train_step_impl
+
 for inplace in ("!", "")
     step = Symbol(:single_train_step_impl, inplace)
+    step_allocator_cache = Symbol(:single_train_step_impl_with_allocator_cache, inplace)
     apply_fn = Symbol(:apply_gradients, inplace)
-    @eval function $(step)(backend, obj_fn::F, data, ts::TrainState) where {F}
-        grads, loss, stats, ts = compute_gradients(backend, obj_fn, data, ts)
-        ts = $(apply_fn)(ts, grads)
-        return grads, loss, stats, ts
+    @eval begin
+        function $(step_allocator_cache)(
+            backend, ::Nothing, obj_fn::F, data, ts::TrainState
+        ) where {F}
+            return $(step)(backend, obj_fn, data, ts)
+        end
+
+        function $(step)(backend, obj_fn::F, data, ts::TrainState) where {F}
+            grads, loss, stats, ts = compute_gradients(backend, obj_fn, data, ts)
+            ts = $(apply_fn)(ts, grads)
+            return grads, loss, stats, ts
+        end
     end
 end
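For reference, the `@eval` loop above expands to roughly the following pair of methods for the in-place variant (the out-of-place variant is identical minus the trailing `!`):

function single_train_step_impl_with_allocator_cache!(
    backend, ::Nothing, obj_fn::F, data, ts::TrainState
) where {F}
    return single_train_step_impl!(backend, obj_fn, data, ts)
end

function single_train_step_impl!(backend, obj_fn::F, data, ts::TrainState) where {F}
    grads, loss, stats, ts = compute_gradients(backend, obj_fn, data, ts)
    ts = apply_gradients!(ts, grads)
    return grads, loss, stats, ts
end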