Commit 9fcfadd

Support sparse hessians (#128)
* Start sparsity for hessian
* Add sparse hessian
* Remove extras second_derivative
* Improve docs
* Remove Test from docs
1 parent 4cfd3d0 commit 9fcfadd

11 files changed: 163 additions & 103 deletions


Project.toml

Lines changed: 17 additions & 17 deletions
@@ -43,25 +43,25 @@ DifferentiationInterfaceZygoteExt = "Zygote"
 
 [compat]
 ADTypes = "0.2.7"
-AbstractDifferentiation = "0.6"
-ChainRulesCore = "1.19"
-Diffractor = "0.2"
-DocStringExtensions = "0.9"
-Enzyme = "0.11"
-FastDifferentiation = "0.3"
-FillArrays = "1"
-FiniteDiff = "2.22"
-FiniteDifferences = "0.12"
-ForwardDiff = "0.10"
+AbstractDifferentiation = "0.6.2"
+ChainRulesCore = "1.23.0"
+Diffractor = "0.2.6"
+DocStringExtensions = "0.9.3"
+Enzyme = "0.11.20"
+FastDifferentiation = "0.3.7"
+FillArrays = "1.9.3"
+FiniteDiff = "2.23.0"
+FiniteDifferences = "0.12.31"
+ForwardDiff = "0.10.36"
 LinearAlgebra = "1"
-PolyesterForwardDiff = "0.1"
-ReverseDiff = "1.15"
-SparseDiffTools = "2.17"
-Symbolics = "5.27"
-Tapir = "0.1"
+PolyesterForwardDiff = "0.1.1"
+ReverseDiff = "1.15.1"
+SparseDiffTools = "2.17.0"
+Symbolics = "5.27.1"
+Tapir = "0.1.2"
 Test = "1"
-Tracker = "0.2"
-Zygote = "0.6"
+Tracker = "0.2.33"
+Zygote = "0.6.69"
 julia = "1.10"
 
 [extras]

README.md

Lines changed: 13 additions & 12 deletions
@@ -26,18 +26,19 @@ This package provides a backend-agnostic syntax to differentiate functions of th
 
 We support most of the backends defined by [ADTypes.jl](https://github.com/SciML/ADTypes.jl):
 
-| backend                                                                          | object                                                        |
-| :------------------------------------------------------------------------------ | :----------------------------------------------------------- |
-| [ChainRulesCore.jl](https://github.com/JuliaDiff/ChainRulesCore.jl)              | `AutoChainRules(ruleconfig)`                                  |
-| [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl)                      | `AutoDiffractor()`                                            |
-| [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl)                               | `AutoEnzyme(Enzyme.Forward)` or `AutoEnzyme(Enzyme.Reverse)`  |
-| [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl)                      | `AutoFiniteDiff()`                                            |
-| [FiniteDifferences.jl](https://github.com/JuliaDiff/FiniteDifferences.jl)        | `AutoFiniteDifferences(fdm)`                                  |
-| [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl)                    | `AutoForwardDiff()`                                           |
-| [PolyesterForwardDiff.jl](https://github.com/JuliaDiff/PolyesterForwardDiff.jl) | `AutoPolyesterForwardDiff(; chunksize)`                       |
-| [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl)                    | `AutoReverseDiff()`                                           |
-| [Tracker.jl](https://github.com/FluxML/Tracker.jl)                               | `AutoTracker()`                                               |
-| [Zygote.jl](https://github.com/FluxML/Zygote.jl)                                 | `AutoZygote()`                                                |
+| backend                                                                          | object                                                      |
+| :------------------------------------------------------------------------------ | :---------------------------------------------------------- |
+| [ChainRulesCore.jl](https://github.com/JuliaDiff/ChainRulesCore.jl)              | `AutoChainRules(ruleconfig)`                                |
+| [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl)                      | `AutoDiffractor()`                                          |
+| [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl)                               | `AutoEnzyme(Enzyme.Forward)`, `AutoEnzyme(Enzyme.Reverse)`  |
+| [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl)                      | `AutoFiniteDiff()`                                          |
+| [FiniteDifferences.jl](https://github.com/JuliaDiff/FiniteDifferences.jl)        | `AutoFiniteDifferences(fdm)`                                |
+| [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl)                    | `AutoForwardDiff()`                                         |
+| [PolyesterForwardDiff.jl](https://github.com/JuliaDiff/PolyesterForwardDiff.jl) | `AutoPolyesterForwardDiff(; chunksize)`                     |
+| [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl)                    | `AutoReverseDiff()`                                         |
+| [SparseDiffTools.jl](https://github.com/JuliaDiff/SparseDiffTools.jl)            | `AutoSparseForwardDiff()`, `AutoSparseFiniteDiff()`         |
+| [Tracker.jl](https://github.com/FluxML/Tracker.jl)                               | `AutoTracker()`                                             |
+| [Zygote.jl](https://github.com/FluxML/Zygote.jl)                                 | `AutoZygote()`                                              |
 
 We also support additional (experimental) backends:
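As a taste of the table above, here is a hedged quick-start sketch using the newly listed sparse backend object. The function, input, and package loading are illustrative assumptions rather than part of this commit (per the docs changed below, sparsity detection also relies on Symbolics.jl):

```julia
# Minimal sketch, not from this commit: a sparse Jacobian via the new entry above.
using DifferentiationInterface
using SparseDiffTools, Symbolics  # assumed: sparse backends + sparsity detection

f(x) = abs2.(x)  # elementwise map, so the Jacobian is diagonal (hence sparse)
J = jacobian(f, AutoSparseForwardDiff(), rand(3))  # expected: sparse 3×3 matrix
```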

docs/src/backends.md

Lines changed: 19 additions & 3 deletions
@@ -63,9 +63,6 @@ AutoZygote
 
 ### Sparse
 
-!!! danger
-    Sparsity support is still experimental, use at your own risk.
-
 ```@docs
 AutoSparseFastDifferentiation
 AutoSparseFiniteDiff
@@ -103,3 +100,22 @@ rows = map(all_backends()) do backend # hide
 end # hide
 Markdown.parse(join(vcat(header, subheader, rows...), "\n")) # hide
 ```
+
+## Hessian support
+
+Only some backends are able to compute Hessians.
+You can use [`check_hessian`](@ref) to check that feature, like we did below:
+
+```@example backends
+header = "| backend | Hessian |" # hide
+subheader = "|---|---|" # hide
+rows = map(all_backends()) do backend # hide
+    "| `$(backend_string(backend))` | $(check_hessian(backend) ? '✅' : '❌') |" # hide
+end # hide
+Markdown.parse(join(vcat(header, subheader, rows...), "\n")) # hide
+```
+
+!!! warning
+    Second-order operators can also be used with a combination of backends inside the [`SecondOrder`](@ref) struct.
+    There are many possible combinations, a lot of which will fail.
+    Due to compilation overhead, we do not currently test them all to display the working ones in the documentation, but we might if users deem it relevant.
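Outside the generated docs table above, a minimal standalone sketch of the same check; the backend choice and exact return value are assumptions:

```julia
# Hypothetical REPL query: can this backend compute Hessians?
using DifferentiationInterface
using ForwardDiff  # assumed available

check_hessian(AutoForwardDiff())  # expected to return true for ForwardDiff
```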

docs/src/overview.md

Lines changed: 22 additions & 3 deletions
@@ -49,10 +49,14 @@ Several variants of each operator are defined:
 # mistakenly keep working with grad_in: NOT OK
 ```
 Note that we don't guarantee `grad_out` will have the same type as `grad_in`.
+Its type can even depend on the choice of backend.
 
 ## Second order
 
-Second-order differentiation is also supported, with the following operators:
+Second-order differentiation is also supported.
+You can either pick a single backend to do all the work, or combine an "outer" backend with an "inner" backend using the [`SecondOrder`](@ref) struct, like so: `SecondOrder(outer, inner)`.
+
+The available operators are similar to first-order ones:
 
 | operator | input `x` | output `y` | result type | result shape |
 | --------------------------- | --------------- | ------------ | ---------------- | ------------------------ |
@@ -97,9 +101,24 @@ By default, all the preparation functions return `nothing`.
 We do not make any guarantees on their implementation for each backend, or on the performance gains that can be expected.
 
 !!! warning
-    We haven't fully figured out what must happen when an `extras` object is prepared for a specific operator but then given to a lower-level one (i.e. prepare it for `jacobian` but then give it to `pushforward` inside `jacobian`).
+    We haven't yet figured out how to deal with extras for second-order operators, because closures make our life rather complicated.
+    For now, consider that preparation doesn't work there in general, although some individual backends may be okay already.
+
+## FAQ
 
-## Multiple inputs/outputs
+### Multiple inputs/outputs
 
 Restricting the API to one input and one output has many coding advantages, but it is not very flexible.
 If you need more than that, use [ComponentArrays.jl](https://github.com/jonniedie/ComponentArrays.jl) to wrap several objects inside a single `ComponentVector`.
+
+### Sparsity
+
+If you need to work with sparse Jacobians, you can pick one of the [sparse backends](@ref Sparse) from [ADTypes.jl](https://github.com/SciML/ADTypes.jl).
+The sparsity pattern is computed automatically with [Symbolics.jl](https://github.com/JuliaSymbolics/Symbolics.jl) during the preparation step.
+
+If you need to work with sparse Hessians, you can use a sparse backend as the _outer_ backend of a `SecondOrder`.
+This means the Hessian is obtained as the sparse Jacobian of the gradient.
+Since preparation does not yet work for second order, the sparsity pattern is currently recomputed every time, so you may not gain much time as things stand.
+
+!!! danger
+    Sparsity support is still experimental, use at your own risk.
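A hedged sketch of the sparse-Hessian recipe described in this FAQ entry; the test function, backend choices, and input size are illustrative assumptions:

```julia
# Sparse Hessian = sparse Jacobian of the gradient, via a sparse *outer* backend.
using DifferentiationInterface
using SparseDiffTools, ForwardDiff, Symbolics  # assumed: backends + sparsity detection

f(x) = sum(abs2, diff(x))  # banded Hessian, so sparsity actually pays off

backend = SecondOrder(AutoSparseForwardDiff(), AutoForwardDiff())  # outer, inner
H = hessian(f, backend, rand(10))  # expected: a SparseMatrixCSC
```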

ext/DifferentiationInterfaceSparseDiffToolsExt/DifferentiationInterfaceSparseDiffToolsExt.jl

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ module DifferentiationInterfaceSparseDiffToolsExt
 
 using ADTypes
 import DifferentiationInterface as DI
-using DifferentiationInterface: JacobianExtras
+using DifferentiationInterface: JacobianExtras, NoHessianExtras, SecondOrder, inner, outer
 using SparseDiffTools:
     AutoSparseEnzyme,
     JacPrototypeSparsityDetection,

ext/DifferentiationInterfaceSparseDiffToolsExt/allocating.jl

Lines changed: 16 additions & 0 deletions
@@ -39,5 +39,21 @@ for AutoSparse in SPARSE_BACKENDS
     )
         return sparse_jacobian(backend, extras.cache, f, x)
     end
+
+    ## Hessian
+
+    DI.prepare_hessian(f, ::SecondOrder{<:$AutoSparse}, x) = NoHessianExtras()
+
+    function DI.hessian(f, backend::SecondOrder{<:$AutoSparse}, x, ::NoHessianExtras)
+        gradient_closure(z) = DI.gradient(f, inner(backend), z)
+        return DI.jacobian(gradient_closure, outer(backend), x)
+    end
+
+    function DI.hessian!!(
+        f, hess, backend::SecondOrder{<:$AutoSparse}, x, ::NoHessianExtras
+    )
+        gradient_closure(z) = DI.gradient(f, inner(backend), z)
+        return DI.jacobian!!(gradient_closure, hess, outer(backend), x)
+    end
 end
 end
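For readers of the extension code above, a hedged sketch of the composition it performs; note that `inner` and `outer` are internal helpers rather than public API, and the backend choices are illustrative:

```julia
# Hand-rolled equivalent of the DI.hessian method added above.
import DifferentiationInterface as DI
using SparseDiffTools, ForwardDiff, Symbolics  # assumed setup

f(x) = sum(abs2, x)
backend = DI.SecondOrder(AutoSparseForwardDiff(), AutoForwardDiff())
x = rand(3)

gradient_closure(z) = DI.gradient(f, DI.inner(backend), z)      # inner pass
H_manual = DI.jacobian(gradient_closure, DI.outer(backend), x)  # outer pass
# H_manual should match DI.hessian(f, backend, x) up to storage details
```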

lib/DifferentiationInterfaceTest/src/tests/sparsity.jl

Lines changed: 27 additions & 0 deletions
@@ -1,3 +1,5 @@
+## Jacobian
+
 function test_sparsity(ba::AbstractADType, scen::JacobianScenario{false}; ref_backend)
     (; f, x, y) = new_scen = deepcopy(scen)
     extras = prepare_jacobian(f, ba, x)
@@ -50,3 +52,28 @@ function test_sparsity(ba::AbstractADType, scen::JacobianScenario{true}; ref_bac
     end
     return nothing
 end
+
+## Hessian
+
+function test_sparsity(ba::AbstractADType, scen::HessianScenario{false}; ref_backend)
+    (; f, x, y) = new_scen = deepcopy(scen)
+    extras = prepare_hessian(f, ba, x)
+    hess_true = if ref_backend isa AbstractADType
+        hessian(f, ref_backend, x)
+    else
+        new_scen.ref(x)
+    end
+
+    hess1 = hessian(f, ba, x, extras)
+    hess2 = hessian!!(f, mysimilar(hess_true), ba, x, extras)
+
+    @testset "Sparse type" begin
+        @test hess1 isa SparseMatrixCSC
+        @test hess2 isa SparseMatrixCSC
+    end
+    @testset "Sparsity pattern" begin
+        @test nnz(hess1) < length(hess_true)
+        @test nnz(hess2) < length(hess_true)
+    end
+    return nothing
+end

src/hvp.jl

Lines changed: 16 additions & 48 deletions
@@ -47,46 +47,30 @@ end
 
 function hvp_aux(f, backend, x, v, extras, ::ForwardOverReverse)
     # JVP of the gradient
-    function gradient_closure(z)
-        inner_extras = prepare_gradient(extras, f, inner(backend), z)
-        return gradient(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_pushforward(extras, gradient_closure, outer(backend), x)
-    p = pushforward(gradient_closure, outer(backend), x, v, outer_extras)
+    gradient_closure(z) = gradient(f, inner(backend), z)
+    p = pushforward(gradient_closure, outer(backend), x, v)
     return p
 end
 
 function hvp_aux(f, backend, x, v, extras, ::ReverseOverForward)
     # gradient of the JVP
-    function jvp_closure(z)
-        inner_extras = prepare_pushforward(extras, f, inner(backend), z)
-        return pushforward(f, inner(backend), z, v, inner_extras)
-    end
-    outer_extras = prepare_gradient(extras, jvp_closure, outer(backend), x)
-    p = gradient(jvp_closure, outer(backend), x, outer_extras)
+    pushforward_closure(z) = pushforward(f, inner(backend), z, v)
+    p = gradient(pushforward_closure, outer(backend), x)
     return p
 end
 
 function hvp_aux(f, backend, x, v, extras, ::ReverseOverReverse)
     # VJP of the gradient
-    function gradient_closure(z)
-        inner_extras = prepare_gradient(extras, f, inner(backend), z)
-        return gradient(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_pullback(extras, gradient_closure, outer(backend), x)
-    p = pullback(gradient_closure, outer(backend), x, v, outer_extras)
+    gradient_closure(z) = gradient(f, inner(backend), z)
+    p = pullback(gradient_closure, outer(backend), x, v)
    return p
 end
 
 function hvp_aux(f, backend, x, v, extras, ::ForwardOverForward)
     # JVPs of JVPs in theory
     # also pushforward of gradient in practice
-    function gradient_closure(z)
-        inner_extras = prepare_gradient(extras, f, inner(backend), z)
-        return gradient(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_pushforward(extras, gradient_closure, outer(backend), x)
-    p = pushforward(gradient_closure, outer(backend), x, v, outer_extras)
+    gradient_closure(z) = gradient(f, inner(backend), z)
+    p = pushforward(gradient_closure, outer(backend), x, v)
     return p
 end
 
@@ -108,41 +92,25 @@ function hvp!!(
 end
 
 function hvp_aux!!(f, p, backend, x, v, extras, ::ForwardOverReverse)
-    function gradient_closure(z)
-        inner_extras = prepare_gradient(extras, f, inner(backend), z)
-        return gradient(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_pushforward(extras, gradient_closure, outer(backend), x)
-    p = pushforward!!(gradient_closure, p, outer(backend), x, v, outer_extras)
+    gradient_closure(z) = gradient(f, inner(backend), z)
+    p = pushforward!!(gradient_closure, p, outer(backend), x, v)
     return p
 end
 
 function hvp_aux!!(f, p, backend, x, v, extras, ::ReverseOverForward)
-    function jvp_closure(z)
-        inner_extras = prepare_pushforward(extras, f, inner(backend), z)
-        return pushforward(f, inner(backend), z, v, inner_extras)
-    end
-    outer_extras = prepare_gradient(extras, jvp_closure, outer(backend), x)
-    p = gradient!!(jvp_closure, p, outer(backend), x, outer_extras)
+    pushforward_closure(z) = pushforward(f, inner(backend), z, v)
+    p = gradient!!(pushforward_closure, p, outer(backend), x)
     return p
 end
 
 function hvp_aux!!(f, p, backend, x, v, extras, ::ReverseOverReverse)
-    function gradient_closure(z)
-        inner_extras = prepare_gradient(extras, f, inner(backend), z)
-        return gradient(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_pullback(extras, gradient_closure, outer(backend), x)
-    p = pullback!!(gradient_closure, p, outer(backend), x, v, outer_extras)
+    gradient_closure(z) = gradient(f, inner(backend), z)
+    p = pullback!!(gradient_closure, p, outer(backend), x, v)
     return p
 end
 
 function hvp_aux!!(f, p, backend, x, v, extras, ::ForwardOverForward)
-    function gradient_closure(z)
-        inner_extras = prepare_gradient(extras, f, inner(backend), z)
-        return gradient(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_pushforward(extras, gradient_closure, outer(backend), x)
-    p = pushforward!!(gradient_closure, p, outer(backend), x, v, outer_extras)
+    gradient_closure(z) = gradient(f, inner(backend), z)
+    p = pushforward!!(gradient_closure, p, outer(backend), x, v)
     return p
 end
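For intuition about the four modes simplified above, a hedged sketch of the ForwardOverReverse composition; the backend pairing and test function are assumptions:

```julia
# ForwardOverReverse by hand: JVP of a reverse-mode gradient closure.
using DifferentiationInterface
using ForwardDiff, Zygote  # assumed: forward outer, reverse inner

f(x) = sum(abs2, x) / 2  # Hessian is the identity, so any HVP should return v
x, v = rand(4), rand(4)

gradient_closure(z) = gradient(f, AutoZygote(), z)          # inner: reverse mode
p = pushforward(gradient_closure, AutoForwardDiff(), x, v)  # outer: forward mode
# p ≈ v, and should agree with hvp(f, SecondOrder(AutoForwardDiff(), AutoZygote()), x, v)
```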

src/second_derivative.jl

Lines changed: 4 additions & 12 deletions
@@ -44,12 +44,8 @@ function second_derivative(
     x,
     extras::SecondDerivativeExtras=prepare_second_derivative(f, backend, x),
 )
-    function derivative_closure(z)
-        inner_extras = prepare_derivative(extras, f, inner(backend), z)
-        return derivative(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_derivative(extras, derivative_closure, outer(backend), x)
-    der2 = derivative(derivative_closure, outer(backend), x, outer_extras)
+    derivative_closure(z) = derivative(f, inner(backend), z)
+    der2 = derivative(derivative_closure, outer(backend), x)
     return der2
 end
 
@@ -75,12 +71,8 @@
     x,
     extras::SecondDerivativeExtras=prepare_second_derivative(f, backend, x),
 )
-    function derivative_closure(z)
-        inner_extras = prepare_derivative(extras, f, inner(backend), z)
-        return derivative(f, inner(backend), z, inner_extras)
-    end
-    outer_extras = prepare_derivative(extras, derivative_closure, outer(backend), x)
-    der2 = derivative!!(derivative_closure, der2, outer(backend), x, outer_extras)
+    derivative_closure(z) = derivative(f, inner(backend), z)
+    der2 = derivative!!(derivative_closure, der2, outer(backend), x)
     return der2
 end
 
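A hedged usage sketch for the simplified operator above; the backend and function are illustrative:

```julia
# Second derivative as a derivative of the inner derivative closure.
using DifferentiationInterface
using ForwardDiff  # assumed backend for both layers

f(x) = sin(x)
backend = SecondOrder(AutoForwardDiff())  # same backend inside and outside
second_derivative(f, backend, 1.0)        # expected ≈ -sin(1.0)
```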

src/second_order.jl

Lines changed: 3 additions & 3 deletions
@@ -11,11 +11,11 @@ Combination of two backends for second-order differentiation.
 
 $(TYPEDFIELDS)
 """
-struct SecondOrder{AD1<:AbstractADType,AD2<:AbstractADType} <: AbstractADType
+struct SecondOrder{ADO<:AbstractADType,ADI<:AbstractADType} <: AbstractADType
     "backend for the outer differentiation"
-    outer::AD1
+    outer::ADO
     "backend for the inner differentiation"
-    inner::AD2
+    inner::ADI
 end
 
 SecondOrder(backend::AbstractADType) = SecondOrder(backend, backend)
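Construction sketch for the renamed type parameters (`ADO` = outer, `ADI` = inner); the argument order is outer first, and the backend choices are illustrative:

```julia
# Two ways to build a SecondOrder backend, per the definitions above.
using DifferentiationInterface
using ForwardDiff, Zygote  # assumed available

mixed = SecondOrder(AutoForwardDiff(), AutoZygote())  # outer forward, inner reverse
same  = SecondOrder(AutoForwardDiff())                # one backend for both layers
```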
