Skip to content

Commit 2ee1bf1

Browse files
authored
Preparation for outer backend of SecondOrder (#135)
1 parent 3e6164f commit 2ee1bf1

24 files changed

Lines changed: 320 additions & 220 deletions

File tree

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,12 @@ We support most of the backends defined by [ADTypes.jl](https://github.com/SciML
4040
| [Tracker.jl](https://github.com/FluxML/Tracker.jl) | `AutoTracker()` |
4141
| [Zygote.jl](https://github.com/FluxML/Zygote.jl) | `AutoZygote()` |
4242

43-
We also support additional (experimental) backends:
43+
We also provide some experimental backends ourselves:
4444

45-
| backend | object |
46-
| :------------------------------------------------------------------------------- | :-------------------------- |
47-
| [FastDifferentiation.jl](https://github.com/brianguenter/FastDifferentiation.jl) | `AutoFastDifferentiation()` |
48-
| [Tapir.jl](https://github.com/withbayes/Tapir.jl) | `AutoTapir()` |
45+
| backend | object |
46+
| :------------------------------------------------------------------------------- | :------------------------------------------------------------- |
47+
| [FastDifferentiation.jl](https://github.com/brianguenter/FastDifferentiation.jl) | `AutoFastDifferentiation()`, `AutoSparseFastDifferentiation()` |
48+
| [Tapir.jl](https://github.com/withbayes/Tapir.jl) | `AutoTapir()` |
4949

5050
## Example
5151

docs/src/overview.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,7 @@ By default, all the preparation functions return `nothing`.
101101
We do not make any guarantees on their implementation for each backend, or on the performance gains that can be expected.
102102

103103
!!! warning
104-
We haven't yet figured out how to deal with extras for second-order operators, because closures make our life rather complicated.
105-
For now, consider that preparation doesn't work there in general, although some individual backends may be okay already.
104+
For `SecondOrder` backends, the inner differentiation cannot be prepared at the moment, only the outer one is.
106105

107106
## FAQ
108107

@@ -118,7 +117,6 @@ The sparsity pattern is computed automatically with [Symbolics.jl](https://githu
118117

119118
If you need to work with sparse Hessians, you can use a sparse backend as the _outer_ backend of a `SecondOrder`.
120119
This means the Hessian is obtained as the sparse Jacobian of the gradient.
121-
Since preparation does not yet work for second order, the sparsity pattern is currently recomputed every time, so you may not gain much time as things stand.
122120

123121
!!! danger
124122
Sparsity support is still experimental, use at your own risk.

ext/DifferentiationInterfaceChainRulesCoreExt/DifferentiationInterfaceChainRulesCoreExt.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ DI.supports_mutation(::AutoChainRules) = DI.MutationNotSupported()
1515
DI.mode(::AutoForwardChainRules) = ADTypes.AbstractForwardMode
1616
DI.mode(::AutoReverseChainRules) = ADTypes.AbstractReverseMode
1717

18-
## Pushforward
18+
## Pushforward (unused)
1919

20+
#=
2021
DI.prepare_pushforward(f, ::AutoForwardChainRules, x) = NoPushforwardExtras()
2122
2223
function DI.value_and_pushforward(
@@ -26,6 +27,7 @@ function DI.value_and_pushforward(
2627
y, new_dy = frule_via_ad(rc, (NoTangent(), dx), f, x)
2728
return y, new_dy
2829
end
30+
=#
2931

3032
## Pullback
3133

ext/DifferentiationInterfaceEnzymeExt/reverse_allocating.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,15 @@ function DI.gradient!!(f, grad, ::AutoReverseEnzyme, x::AbstractArray, ::NoGradi
6868
gradient!(Reverse, grad_sametype, f, x)
6969
return grad_sametype
7070
end
71+
72+
function DI.value_and_gradient(
73+
f, backend::AutoReverseEnzyme, x::AbstractArray, ::NoGradientExtras
74+
)
75+
return DI.value_and_pullback(f, backend, x, one(eltype(x)), NoPullbackExtras())
76+
end
77+
78+
function DI.value_and_gradient!!(
79+
f, grad, backend::AutoReverseEnzyme, x::AbstractArray, ::NoGradientExtras
80+
)
81+
return DI.value_and_pullback!!(f, grad, backend, x, one(eltype(x)), NoPullbackExtras())
82+
end

ext/DifferentiationInterfaceFastDifferentiationExt/allocating.jl

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,25 @@ function DI.value_and_derivative!!(
122122
return DI.value_and_derivative(f, backend, x, extras)
123123
end
124124

125+
function DI.derivative(
126+
f,
127+
backend::AnyAutoFastDifferentiation,
128+
x,
129+
extras::FastDifferentiationAllocatingDerivativeExtras,
130+
)
131+
return DI.value_and_derivative(f, backend, x, extras)[2]
132+
end
133+
134+
function DI.derivative!!(
135+
f,
136+
der,
137+
backend::AnyAutoFastDifferentiation,
138+
x,
139+
extras::FastDifferentiationAllocatingDerivativeExtras,
140+
)
141+
return DI.derivative(f, backend, x, extras)
142+
end
143+
125144
## Jacobian
126145

127146
struct FastDifferentiationAllocatingJacobianExtras{E} <: JacobianExtras
@@ -226,7 +245,7 @@ struct FastDifferentiationHVPExtras{E} <: HVPExtras
226245
hvp_exe::E
227246
end
228247

229-
function DI.prepare_hvp(f, ::AnyAutoFastDifferentiation, x)
248+
function DI.prepare_hvp(f, ::AnyAutoFastDifferentiation, x, v)
230249
x_var = if x isa Number
231250
only(make_variables(:x))
232251
else

ext/DifferentiationInterfaceForwardDiffExt/DifferentiationInterfaceForwardDiffExt.jl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,12 @@ module DifferentiationInterfaceForwardDiffExt
33
using ADTypes: AbstractADType, AutoForwardDiff, AutoSparseForwardDiff
44
import DifferentiationInterface as DI
55
using DifferentiationInterface:
6-
DerivativeExtras, GradientExtras, HessianExtras, JacobianExtras, NoPushforwardExtras
6+
DerivativeExtras,
7+
GradientExtras,
8+
HessianExtras,
9+
JacobianExtras,
10+
NoDerivativeExtras,
11+
NoPushforwardExtras
712
using ForwardDiff.DiffResults: DiffResults, DiffResult, GradientResult
813
using ForwardDiff:
914
Chunk,

ext/DifferentiationInterfacePolyesterForwardDiffExt/DifferentiationInterfacePolyesterForwardDiffExt.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ using DifferentiationInterface:
1111
GradientExtras,
1212
HessianExtras,
1313
JacobianExtras,
14+
NoDerivativeExtras,
1415
NoGradientExtras,
16+
NoHessianExtras,
1517
NoJacobianExtras,
1618
PushforwardExtras
1719
using DocStringExtensions

ext/DifferentiationInterfacePolyesterForwardDiffExt/allocating.jl

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -59,38 +59,44 @@ end
5959

6060
## Gradient
6161

62-
DI.prepare_gradient(f, ::AnyAutoPolyForwardDiff, x) = NoGradientExtras()
62+
function DI.prepare_gradient(f, backend::AnyAutoPolyForwardDiff, x)
63+
return DI.prepare_gradient(f, single_threaded(backend), x)
64+
end
6365

6466
function DI.value_and_gradient!!(
65-
f,
66-
grad::AbstractVector,
67-
::AnyAutoPolyForwardDiff{C},
68-
x::AbstractVector,
69-
::NoGradientExtras,
67+
f, grad, ::AnyAutoPolyForwardDiff{C}, x::AbstractVector, ::GradientExtras
7068
) where {C}
7169
threaded_gradient!(f, grad, x, Chunk{C}())
7270
return f(x), grad
7371
end
7472

7573
function DI.gradient!!(
76-
f,
77-
grad::AbstractVector,
78-
::AnyAutoPolyForwardDiff{C},
79-
x::AbstractVector,
80-
::NoGradientExtras,
74+
f, grad, ::AnyAutoPolyForwardDiff{C}, x::AbstractVector, ::GradientExtras
8175
) where {C}
8276
threaded_gradient!(f, grad, x, Chunk{C}())
8377
return grad
8478
end
8579

80+
function DI.value_and_gradient!!(
81+
f, grad, backend::AnyAutoPolyForwardDiff{C}, x::AbstractArray, extras::GradientExtras
82+
) where {C}
83+
return DI.value_and_gradient!!(f, grad, single_threaded(backend), x, extras)
84+
end
85+
86+
function DI.gradient!!(
87+
f, grad, backend::AnyAutoPolyForwardDiff{C}, x::AbstractArray, extras::GradientExtras
88+
) where {C}
89+
return DI.gradient!!(f, grad, single_threaded(backend), x, extras)
90+
end
91+
8692
function DI.value_and_gradient(
87-
f, backend::AnyAutoPolyForwardDiff, x::AbstractVector, extras::NoGradientExtras
93+
f, backend::AnyAutoPolyForwardDiff, x::AbstractArray, extras::GradientExtras
8894
)
8995
return DI.value_and_gradient!!(f, similar(x), backend, x, extras)
9096
end
9197

9298
function DI.gradient(
93-
f, backend::AnyAutoPolyForwardDiff, x::AbstractVector, extras::NoGradientExtras
99+
f, backend::AnyAutoPolyForwardDiff, x::AbstractArray, extras::GradientExtras
94100
)
95101
return DI.gradient!!(f, similar(x), backend, x, extras)
96102
end

ext/DifferentiationInterfaceSparseDiffToolsExt/DifferentiationInterfaceSparseDiffToolsExt.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ module DifferentiationInterfaceSparseDiffToolsExt
22

33
using ADTypes
44
import DifferentiationInterface as DI
5-
using DifferentiationInterface: JacobianExtras, NoHessianExtras, SecondOrder, inner, outer
5+
using DifferentiationInterface:
6+
HessianExtras, JacobianExtras, NoHessianExtras, SecondOrder, inner, outer
67
using SparseDiffTools:
78
AutoSparseEnzyme,
89
JacPrototypeSparsityDetection,

ext/DifferentiationInterfaceSparseDiffToolsExt/allocating.jl

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ struct SparseDiffToolsAllocatingJacobianExtras{C} <: JacobianExtras
22
cache::C
33
end
44

5+
struct SparseDiffToolsHessianExtras{C,E} <: HessianExtras
6+
inner_gradient_closure::C
7+
outer_jacobian_extras::E
8+
end
9+
510
for AutoSparse in SPARSE_BACKENDS
611
@eval begin
712

@@ -42,18 +47,41 @@ for AutoSparse in SPARSE_BACKENDS
4247

4348
## Hessian
4449

45-
DI.prepare_hessian(f, ::SecondOrder{<:$AutoSparse}, x) = NoHessianExtras()
50+
function DI.prepare_hessian(f, backend::SecondOrder{<:$AutoSparse}, x)
51+
inner_gradient_closure(z) = DI.gradient(f, inner(backend), z)
52+
outer_jacobian_extras = DI.prepare_jacobian(
53+
inner_gradient_closure, outer(backend), x
54+
)
55+
return SparseDiffToolsHessianExtras(
56+
inner_gradient_closure, outer_jacobian_extras
57+
)
58+
end
4659

47-
function DI.hessian(f, backend::SecondOrder{<:$AutoSparse}, x, ::NoHessianExtras)
48-
gradient_closure(z) = DI.gradient(f, inner(backend), z)
49-
return DI.jacobian(gradient_closure, outer(backend), x)
60+
function DI.hessian(
61+
f, backend::SecondOrder{<:$AutoSparse}, x, extras::SparseDiffToolsHessianExtras
62+
)
63+
return DI.jacobian(
64+
extras.inner_gradient_closure,
65+
outer(backend),
66+
x,
67+
extras.outer_jacobian_extras,
68+
)
5069
end
5170

5271
function DI.hessian!!(
53-
f, hess, backend::SecondOrder{<:$AutoSparse}, x, ::NoHessianExtras
72+
f,
73+
hess,
74+
backend::SecondOrder{<:$AutoSparse},
75+
x,
76+
extras::SparseDiffToolsHessianExtras,
5477
)
55-
gradient_closure(z) = DI.gradient(f, inner(backend), z)
56-
return DI.jacobian!!(gradient_closure, hess, outer(backend), x)
78+
return DI.jacobian!!(
79+
extras.inner_gradient_closure,
80+
hess,
81+
outer(backend),
82+
x,
83+
extras.outer_jacobian_extras,
84+
)
5785
end
5886
end
5987
end

0 commit comments

Comments (0)