
Commit 4379ec3

refactor: use Lux primitives for AD (#995)
* refactor: use Lux primitives for AD
* fix: workaround SciML/Optimization.jl#848
1 parent a576e39 commit 4379ec3

File tree

3 files changed, +17 −40 lines


examples/Basics/main.jl

Lines changed: 12 additions & 37 deletions
@@ -214,50 +214,25 @@ f(x) = x .* x ./ 2
 x = randn(rng, Float32, 5)
 v = ones(Float32, 5)

-# Construct the pushforward function. We will write out the function here but in
-# practice we recommend using
-# [SparseDiffTools.auto_jacvec](https://docs.sciml.ai/SparseDiffTools/stable/#Jacobian-Vector-and-Hessian-Vector-Products)!
-
-# First we need to create a Tag for ForwardDiff. It is enough to know that this is something
-# that you must do. For more details, see the
-# [ForwardDiff Documentation](https://juliadiff.org/ForwardDiff.jl/dev/user/advanced/#Custom-tags-and-tag-checking)!
-struct TestTag end
-
-# Going in the details of what is function is doing is beyond the scope of this tutorial.
-# But in short, it is constructing a new Dual Vector with the partials set to the input
-# to the pushforward function. When this is propagated through the original function
-# we get the value and the jvp
-function pushforward_forwarddiff(f, x)
-    T = eltype(x)
-    function pushforward(v)
-        v_ = reshape(v, axes(x))
-        y = ForwardDiff.Dual{
-            ForwardDiff.Tag{TestTag, T}, T, 1}.(x, ForwardDiff.Partials.(tuple.(v_)))
-        res = vec(f(y))
-        return ForwardDiff.value.(res), vec(ForwardDiff.partials.(res, 1))
-    end
-    return pushforward
-end
-
-pf_f = pushforward_forwarddiff(f, x)
+# !!! warning "Using DifferentiationInterface"
+#
+#     While DifferentiationInterface provides these functions for a wider range of backends,
+#     we currently don't recommend using them with Lux models, since the functions presented
+#     here come with additional goodies like
+#     [fast second-order derivatives](@ref nested_autodiff).

-# Compute the jvp.
+# Compute the jvp. `AutoForwardDiff` specifies that we want to use `ForwardDiff.jl` for the
+# Jacobian-Vector Product

-val, jvp = pf_f(v)
-println("Computed Value: f(", x, ") = ", val)
-println("JVP: ", jvp[1])
+jvp = jacobian_vector_product(f, AutoForwardDiff(), x, v)
+println("JVP: ", jvp)

 # ### Vector-Jacobian Product

 # Using the same function and inputs, let us compute the VJP.

-val, pb_f = Zygote.pullback(f, x)
-
-# Compute the vjp.
-
-vjp = only(pb_f(v))
-println("Computed Value: f(", x, ") = ", val)
-println("VJP: ", vjp[1])
+vjp = vector_jacobian_product(f, AutoZygote(), x, v)
+println("VJP: ", vjp)

 # ## Linear Regression

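For reference, here is a minimal standalone sketch (not part of the commit) of the API the new tutorial code calls. It assumes `jacobian_vector_product` and `vector_jacobian_product` are available from `using Lux`, that the `AutoForwardDiff`/`AutoZygote` backend selectors come from ADTypes.jl, and that ForwardDiff.jl and Zygote.jl are installed; `Random.default_rng()` stands in for the tutorial's rng.

using ADTypes, ForwardDiff, Lux, Random, Zygote

# The tutorial's toy function and input shapes
f(x) = x .* x ./ 2
rng = Random.default_rng()
x = randn(rng, Float32, 5)
v = ones(Float32, 5)

# Forward-mode Jacobian-vector product (pushforward) via ForwardDiff
jvp = jacobian_vector_product(f, AutoForwardDiff(), x, v)

# Reverse-mode vector-Jacobian product (pullback) via Zygote
vjp = vector_jacobian_product(f, AutoZygote(), x, v)

# For f(x) = x .* x ./ 2 the Jacobian is diagonal in x, so both products equal x .* v
println("JVP: ", jvp)
println("VJP: ", vjp)
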
examples/NeuralODE/main.jl

Lines changed: 4 additions & 2 deletions
@@ -1,7 +1,8 @@
 # # MNIST Classification using Neural ODEs

 # To understand Neural ODEs, users should look up
-# [these lecture notes](https://book.sciml.ai/notes/11-Differentiable_Programming_and_Neural_Differential_Equations/). We recommend users to directly use
+# [these lecture notes](https://book.sciml.ai/notes/11-Differentiable_Programming_and_Neural_Differential_Equations/).
+# We recommend users to directly use
 # [DiffEqFlux.jl](https://docs.sciml.ai/DiffEqFlux/stable/), instead of implementing
 # Neural ODEs from scratch.

@@ -31,7 +32,8 @@ function loadmnist(batchsize, train_split)
         ## Use DataLoader to automatically minibatch and shuffle the data
         DataLoader(collect.((x_train, y_train)); batchsize, shuffle=true),
         ## Don't shuffle the test data
-        DataLoader(collect.((x_test, y_test)); batchsize, shuffle=false))
+        DataLoader(collect.((x_test, y_test)); batchsize, shuffle=false)
+    )
 end

 # ## Define the Neural ODE Layer

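The `loadmnist` hunk above only reflows existing code, but for readers unfamiliar with the call it closes, here is a small illustrative sketch of how `MLUtils.DataLoader` minibatches a tuple of arrays. The fake images and labels below are stand-ins, not data from the commit.

using MLUtils

# Stand-ins for (x_train, y_train): 128 flattened "images" and one-hot-style labels
x_train = rand(Float32, 28 * 28, 128)
y_train = rand(Bool, 10, 128)
batchsize = 32

# Same pattern as the tutorial: minibatch and shuffle the training data
train_loader = DataLoader(collect.((x_train, y_train)); batchsize, shuffle=true)

for (xb, yb) in train_loader
    # Each iteration yields one shuffled minibatch; columns stay aligned across x and y
    @assert size(xb, 2) == size(yb, 2) == batchsize
end
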
examples/OptimizationIntegration/main.jl

Lines changed: 1 addition & 1 deletion
@@ -114,7 +114,7 @@ function train_model(dataloader)
     opt_prob = OptimizationProblem(opt_func, ps_ca, dataloader)

     epochs = 25
-    res_adam = solve(opt_prob, Optimisers.Adam(0.001); callback, maxiters=epochs)
+    res_adam = solve(opt_prob, Optimisers.Adam(0.001); callback, epochs)

     ## Let's finetune a bit with L-BFGS
     opt_prob = OptimizationProblem(opt_func, res_adam.u, (gdev(ode_data), TimeWrapper(t)))

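To see the change in context, here is a minimal self-contained sketch (not from the commit) of the pattern it relies on: when the `OptimizationProblem` carries a data iterator in its parameter slot, the epoch count goes through the `epochs` keyword instead of `maxiters`, which is the workaround for SciML/Optimization.jl#848 mentioned in the commit message. The toy data, loss, and initial parameters below are illustrative assumptions.

using MLUtils, Optimisers, Optimization, OptimizationOptimisers, Zygote

# Toy regression data: fit y = 2x + 1 from minibatches
xs = collect(Float32, range(0, 1; length=64))
ys = 2.0f0 .* xs .+ 1.0f0
dataloader = DataLoader((xs, ys); batchsize=16)

# With an iterator as the problem's `p`, the objective receives (parameters, batch)
function loss(ps, batch)
    x, y = batch
    yhat = ps[1] .* x .+ ps[2]
    return sum(abs2, yhat .- y)
end

opt_func = OptimizationFunction(loss, Optimization.AutoZygote())
opt_prob = OptimizationProblem(opt_func, Float32[0.0, 0.0], dataloader)

# As in the diff above: pass the epoch count via `epochs` rather than `maxiters`
epochs = 25
res_adam = solve(opt_prob, Optimisers.Adam(0.001); epochs)
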
0 commit comments
