Commit 4496997

Revamp batch size handling (#575)
* Batch size
* Revamp batch size computations
* Fix
* Single batch modifications
* Introduce BatchSizeSettings
* Fix PolyesterFD
* Fix
* Fix internals
* Type stab
* Fix
* Fixes
* Coverage
* Fix
* Guess activity in Enzyme
* Fix
* Fixes
* No static test
* More coverage
* AutoSparse with adaptive backends
* Proper thresholding
* Fix
* Fixes
* Fix
* Up
* Fix
1 parent fd7580c commit 4496997

25 files changed

Lines changed: 629 additions & 301 deletions


DifferentiationInterface/Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name = "DifferentiationInterface"
 uuid = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 authors = ["Guillaume Dalle", "Adrian Hill"]
-version = "0.6.12"
+version = "0.6.13"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"

DifferentiationInterface/docs/src/explanation/advanced.md

Lines changed: 20 additions & 0 deletions
@@ -67,3 +67,23 @@ The complexity of sparse Jacobians or Hessians grows with the number of distinct
 To reduce this number of colors, [`GreedyColoringAlgorithm`](@ref) has two main settings: the order used for vertices and the decompression method.
 Depending on your use case, you may want to modify either of these options to increase performance.
 See the documentation of [SparseMatrixColorings.jl](https://github.com/gdalle/SparseMatrixColorings.jl) for details.
+
+## Batch mode
+
+### Multiple tangents
+
+The [`jacobian`](@ref) and [`hessian`](@ref) operators compute matrices by repeatedly applying lower-level operators ([`pushforward`](@ref), [`pullback`](@ref) or [`hvp`](@ref)) to a set of tangents.
+The tangents usually correspond to basis elements of the appropriate vector space.
+We could call the lower-level operator on each tangent separately, but some packages ([ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) and [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl)) provide optimized implementations that handle multiple tangents at once.
+
+This behavior is often called "vector mode" AD, but we call it "batch mode" to avoid confusion with Julia's `Vector` type.
+In fact, the optimal batch size $B$ (the number of simultaneous tangents) is usually very small, so tangents are passed within an `NTuple` rather than a `Vector`.
+When the underlying vector space has dimension $N$, the operators `jacobian` and `hessian` process $\lceil N / B \rceil$ batches of size $B$ each.
+
+### Optimal batch size
+
+For every backend which does not support batch mode, the batch size is set to $B = 1$.
+For [`AutoForwardDiff`](@extref ADTypes.AutoForwardDiff) and [`AutoEnzyme`](@extref ADTypes.AutoEnzyme), more complicated rules apply.
+If the backend object has a pre-determined batch size $B_0$, then we always set $B = B_0$.
+In particular, this throws an error whenever $N < B_0$.
+On the other hand, without a pre-determined batch size, we apply backend-specific heuristics to pick $B$ based on $N$.
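The batch arithmetic described above ($\lceil N / B \rceil$ batches of size $B$) can be sketched in a few lines. This is a language-agnostic illustration in Python, not part of the (Julia) package; the helper name `batch_count` is hypothetical.

```python
import math

def batch_count(N: int, B: int) -> int:
    """Number of batches of size B needed to cover N tangents: ceil(N / B)."""
    return math.ceil(N / B)

# With N = 10 basis tangents and batch size B = 4, jacobian/hessian
# would make ceil(10 / 4) = 3 batched calls, covering 4 + 4 + 2 tangents.
print(batch_count(10, 4))  # → 3
```

The last batch is only full when $B$ divides $N$ exactly, which is why the new `BatchSizeSettings` type introduced in this commit tracks an `aligned` flag.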

DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/DifferentiationInterfaceEnzymeExt.jl

Lines changed: 2 additions & 2 deletions
@@ -16,12 +16,12 @@ using DifferentiationInterface:
     NoHVPPrep,
     NoJacobianPrep,
     NoPullbackPrep,
-    NoPushforwardPrep,
-    pick_batchsize
+    NoPushforwardPrep
 using Enzyme:
     Active,
     Annotation,
     BatchDuplicated,
+    BatchMixedDuplicated,
     Const,
     Duplicated,
     DuplicatedNoNeed,

DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl

Lines changed: 2 additions & 2 deletions
@@ -121,7 +121,7 @@ end
 function DI.prepare_gradient(
     f::F, backend::AutoEnzyme{<:ForwardMode,<:Union{Nothing,Const}}, x
 ) where {F}
-    valB = pick_batchsize(backend, length(x))
+    valB = to_val(DI.pick_batchsize(backend, x))
     shadows = create_shadows(valB, x)
     return EnzymeForwardGradientPrep(valB, shadows)
 end
@@ -190,7 +190,7 @@ function DI.prepare_jacobian(
     f::F, backend::AutoEnzyme{<:Union{ForwardMode,Nothing},<:Union{Nothing,Const}}, x
 ) where {F}
     y = f(x)
-    valB = pick_batchsize(backend, length(x))
+    valB = to_val(DI.pick_batchsize(backend, x))
     shadows = create_shadows(valB, x)
     return EnzymeForwardOneArgJacobianPrep(valB, shadows, length(y))
 end

DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl

Lines changed: 1 addition & 1 deletion
@@ -337,7 +337,7 @@ end
 function DI.prepare_jacobian(f::F, backend::AutoEnzyme{<:ReverseMode,Nothing}, x) where {F}
     y = f(x)
     Sy = size(y)
-    valB = pick_batchsize(backend, prod(Sy))
+    valB = to_val(DI.pick_batchsize(backend, y))
     return EnzymeReverseOneArgJacobianPrep(Val(Sy), valB)
 end

DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl

Lines changed: 11 additions & 3 deletions
@@ -1,5 +1,12 @@
 # until https://github.com/EnzymeAD/Enzyme.jl/pull/1545 is merged
-DI.pick_batchsize(::AutoEnzyme, dimension::Integer) = Val(min(dimension, 16))
+function DI.BatchSizeSettings(::AutoEnzyme, N::Integer)
+    B = DI.reasonable_batchsize(N, 16)
+    singlebatch = B == N
+    aligned = N % B == 0
+    return DI.BatchSizeSettings{B,singlebatch,aligned}(N)
+end
+
+to_val(::DI.BatchSizeSettings{B}) where {B} = Val(B)
 
 ## Annotations
 
@@ -17,9 +24,10 @@ function get_f_and_df(
         M,
         <:Union{
             Duplicated,
-            EnzymeCore.DuplicatedNoNeed,
+            MixedDuplicated,
             BatchDuplicated,
-            EnzymeCore.BatchDuplicatedFunc,
+            BatchMixedDuplicated,
+            EnzymeCore.DuplicatedNoNeed,
             EnzymeCore.BatchDuplicatedNoNeed,
         },
     },
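The flags computed by the new `BatchSizeSettings` constructor for `AutoEnzyme` can be mirrored in plain Python. Note that `DI.reasonable_batchsize` is not shown in this diff; the sketch below assumes the simplest reading, a cap at 16 matching the replaced line `Val(min(dimension, 16))`, so the exact `B` values are an assumption.

```python
def batch_size_settings(N: int, Bmax: int = 16):
    """Hypothetical mirror of DI.BatchSizeSettings for AutoEnzyme.

    Assumes reasonable_batchsize(N, Bmax) == min(N, Bmax), i.e. the cap
    used by the code this commit replaces.
    """
    B = min(N, Bmax)          # assumed heuristic: cap the batch size
    singlebatch = B == N      # one batch covers every tangent
    aligned = N % B == 0      # the last batch is full
    return B, singlebatch, aligned

print(batch_size_settings(10))  # (10, True, True): a single full batch
print(batch_size_settings(20))  # (16, False, False): 16 + 4 tangents
```

Encoding `singlebatch` and `aligned` in the type parameters (as the Julia code does) lets downstream operators dispatch on them without runtime branching.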

DifferentiationInterface/ext/DifferentiationInterfaceForwardDiffExt/DifferentiationInterfaceForwardDiffExt.jl

Lines changed: 1 addition & 18 deletions
@@ -4,6 +4,7 @@ using ADTypes: AbstractADType, AutoForwardDiff
 using Base: Fix1, Fix2
 import DifferentiationInterface as DI
 using DifferentiationInterface:
+    BatchSizeSettings,
     Context,
     DerivativePrep,
     DifferentiateWith,
@@ -49,24 +50,6 @@ using LinearAlgebra: dot, mul!
 
 DI.check_available(::AutoForwardDiff) = true
 
-function DI.pick_batchsize(
-    ::AutoForwardDiff{chunksize}, dimension::Integer
-) where {chunksize}
-    return Val{chunksize}()
-end
-
-function DI.pick_batchsize(::AutoForwardDiff{nothing}, dimension::Integer)
-    # type-unstable
-    return Val(ForwardDiff.pickchunksize(dimension))
-end
-
-function DI.threshold_batchsize(
-    backend::AutoForwardDiff{chunksize1}, chunksize2::Integer
-) where {chunksize1}
-    chunksize = (chunksize1 === nothing) ? nothing : min(chunksize1, chunksize2)
-    return AutoForwardDiff(; chunksize, tag=backend.tag)
-end
-
 include("utils.jl")
 include("onearg.jl")
 include("twoarg.jl")

DifferentiationInterface/ext/DifferentiationInterfaceForwardDiffExt/utils.jl

Lines changed: 24 additions & 0 deletions
@@ -1,3 +1,27 @@
+function DI.BatchSizeSettings(::AutoForwardDiff{nothing}, N::Integer)
+    B = ForwardDiff.pickchunksize(N)
+    singlebatch = B == N
+    aligned = N % B == 0
+    return BatchSizeSettings{B,singlebatch,aligned}(N)
+end
+
+function DI.BatchSizeSettings(::AutoForwardDiff{chunksize}, N::Integer) where {chunksize}
+    if chunksize > N
+        throw(ArgumentError("Fixed chunksize $chunksize larger than input size $N"))
+    end
+    B = chunksize
+    singlebatch = B == N
+    aligned = N % B == 0
+    return BatchSizeSettings{B,singlebatch,aligned}(N)
+end
+
+function DI.threshold_batchsize(
+    backend::AutoForwardDiff{chunksize1}, chunksize2::Integer
+) where {chunksize1}
+    chunksize = isnothing(chunksize1) ? nothing : min(chunksize1, chunksize2)
+    return AutoForwardDiff(; chunksize, tag=backend.tag)
+end
+
 choose_chunk(::AutoForwardDiff{nothing}, x) = Chunk(x)
 choose_chunk(::AutoForwardDiff{chunksize}, x) where {chunksize} = Chunk{chunksize}()
 
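The thresholding rule added for `AutoForwardDiff` is simple enough to state outside Julia: an adaptive backend (chunksize `nothing`) stays adaptive, while a fixed chunksize is clamped to the given limit. A Python sketch, with the hypothetical name `threshold_chunksize` standing in for `DI.threshold_batchsize`:

```python
def threshold_chunksize(chunksize, limit):
    """Adaptive backends (chunksize=None) pass through unchanged;
    a fixed chunksize is clamped to at most `limit`."""
    return None if chunksize is None else min(chunksize, limit)

print(threshold_chunksize(None, 8))  # None: adaptive stays adaptive
print(threshold_chunksize(12, 8))    # 8: fixed chunksize clamped
print(threshold_chunksize(4, 8))     # 4: already below the limit
```

Clamping rather than erroring is what lets a fixed-chunksize backend be reused on outputs smaller than its chunksize, e.g. when `AutoSparse` wraps it.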

DifferentiationInterface/ext/DifferentiationInterfacePolyesterForwardDiffExt/DifferentiationInterfacePolyesterForwardDiffExt.jl

Lines changed: 3 additions & 3 deletions
@@ -28,14 +28,14 @@ end
 
 DI.check_available(::AutoPolyesterForwardDiff) = true
 
-function DI.pick_batchsize(backend::AutoPolyesterForwardDiff, dimension::Integer)
-    return DI.pick_batchsize(single_threaded(backend), dimension)
+function DI.BatchSizeSettings(backend::AutoPolyesterForwardDiff, x_or_N)
+    return DI.BatchSizeSettings(single_threaded(backend), x_or_N)
 end
 
 function DI.threshold_batchsize(
     backend::AutoPolyesterForwardDiff{chunksize1}, chunksize2::Integer
 ) where {chunksize1}
-    chunksize = (chunksize1 === nothing) ? nothing : min(chunksize1, chunksize2)
+    chunksize = isnothing(chunksize1) ? nothing : min(chunksize1, chunksize2)
     return AutoPolyesterForwardDiff(; chunksize, tag=backend.tag)
 end

DifferentiationInterface/ext/DifferentiationInterfaceSparseMatrixColoringsExt/DifferentiationInterfaceSparseMatrixColoringsExt.jl

Lines changed: 4 additions & 3 deletions
@@ -10,18 +10,19 @@ using ADTypes:
     hessian_sparsity
 using DifferentiationInterface
 using DifferentiationInterface:
+    BatchSizeSettings,
     GradientPrep,
     HessianPrep,
     HVPPrep,
     JacobianPrep,
     PullbackPrep,
     PushforwardPrep,
     PushforwardFast,
-    PushforwardSlow,
+    PushforwardPerformance,
     inner,
+    outer,
     multibasis,
-    pick_hessian_batchsize,
-    pick_jacobian_batchsize,
+    pick_batchsize,
     pushforward_performance,
     unwrap,
     with_contexts
