From aa2c2e45bc31696eb3df22de9c1bcae9c3d5abdf Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 10:15:52 +0200
Subject: [PATCH 1/8] perf: allocate Enzyme shadow memory during preparation

---
 DifferentiationInterface/Project.toml         |   2 +-
 .../forward_onearg.jl                         | 107 ++++++----
 .../forward_twoarg.jl                         |  40 ++--
 .../reverse_onearg.jl                         | 104 +++++----
 .../reverse_twoarg.jl                         |  67 +++---
 .../utils.jl                                  | 197 ++++++++++++++----
 ...ionInterfaceSparseConnectivityTracerExt.jl |   4 +-
 7 files changed, 354 insertions(+), 167 deletions(-)

diff --git a/DifferentiationInterface/Project.toml b/DifferentiationInterface/Project.toml
index 75fbf7634..1a1916f3a 100644
--- a/DifferentiationInterface/Project.toml
+++ b/DifferentiationInterface/Project.toml
@@ -1,7 +1,7 @@
 name = "DifferentiationInterface"
 uuid = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 authors = ["Guillaume Dalle", "Adrian Hill"]
-version = "0.6.52"
+version = "0.6.53"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl
index 9ce785d99..4b0c855b2 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl
@@ -1,83 +1,96 @@
 ## Pushforward
 
+struct EnzymeOneArgPushforwardPrep{SIG,DF,DC} <: DI.PushforwardPrep{SIG}
+    _sig::Val{SIG}
+    df::DF
+    context_shadows::DC
+end
+
 function DI.prepare_pushforward_nokwarg(
     strict::Val,
     f::F,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
-    tx::NTuple,
+    tx::NTuple{B},
     contexts::Vararg{DI.Context,C};
-) where {F,C}
+) where {F,C,B}
     _sig = DI.signature(f, backend, x, tx, contexts...; strict)
-    return DI.NoPushforwardPrep(_sig)
+    df = function_shadow(f, backend, Val(B))
+    mode = forward_withprimal(backend)
+    context_shadows = shadows(backend, mode, Val(B), contexts...)
+    return EnzymeOneArgPushforwardPrep(_sig, df, context_shadows)
 end
 
 function DI.value_and_pushforward(
     f::F,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeOneArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{1},
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, tx, contexts...)
+    (; df, context_shadows) = prep
     mode = forward_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
     dx = only(tx)
     x_and_dx = Duplicated(x, dx)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     dy, y = autodiff(mode, f_and_df, x_and_dx, annotated_contexts...)
     return y, (dy,)
 end
 
 function DI.value_and_pushforward(
     f::F,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeOneArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{B},
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f, prep, backend, x, tx, contexts...)
+    (; df, context_shadows) = prep
     mode = forward_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode, Val(B))
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(B))
     x_and_tx = BatchDuplicated(x, tx)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     ty, y = autodiff(mode, f_and_df, x_and_tx, annotated_contexts...)
     return y, values(ty)
 end
 
 function DI.pushforward(
     f::F,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeOneArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{1},
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, tx, contexts...)
+    (; df, context_shadows) = prep
     mode = forward_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
     dx = only(tx)
     x_and_dx = Duplicated(x, dx)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     dy = only(autodiff(mode, f_and_df, x_and_dx, annotated_contexts...))
     return (dy,)
 end
 
 function DI.pushforward(
     f::F,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeOneArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{B},
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f, prep, backend, x, tx, contexts...)
+    (; df, context_shadows) = prep
     mode = forward_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode, Val(B))
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(B))
     x_and_tx = BatchDuplicated(x, tx)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     ty = only(autodiff(mode, f_and_df, x_and_tx, annotated_contexts...))
     return values(ty)
 end
@@ -85,7 +98,7 @@ end
 function DI.value_and_pushforward!(
     f::F,
     ty::NTuple,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeOneArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple,
@@ -101,7 +114,7 @@ end
 function DI.pushforward!(
     f::F,
     ty::NTuple,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeOneArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple,
@@ -116,10 +129,12 @@ end
 
 ## Gradient
 
-struct EnzymeForwardGradientPrep{SIG,B,O} <: DI.GradientPrep{SIG}
+struct EnzymeForwardGradientPrep{SIG,B,DF,DC,O} <: DI.GradientPrep{SIG}
     _sig::Val{SIG}
     _valB::Val{B}
-    shadows::O
+    df::DF
+    context_shadows::DC
+    basis_shadows::O
 end
 
 function DI.prepare_gradient_nokwarg(
@@ -131,8 +146,11 @@ function DI.prepare_gradient_nokwarg(
 ) where {F,C}
     _sig = DI.signature(f, backend, x, contexts...; strict)
     valB = to_val(DI.pick_batchsize(backend, x))
-    shadows = create_shadows(valB, x)
-    return EnzymeForwardGradientPrep(_sig, valB, shadows)
+    df = function_shadow(f, backend, valB)
+    mode = forward_withprimal(backend)
+    context_shadows = shadows(backend, mode, valB, contexts...)
+    basis_shadows = create_shadows(valB, x)
+    return EnzymeForwardGradientPrep(_sig, valB, df, context_shadows, basis_shadows)
 end
 
 function DI.gradient(
@@ -143,11 +161,12 @@ function DI.gradient(
     contexts::Vararg{DI.Constant,C},
 ) where {F,SIG,B,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows, basis_shadows) = prep
     mode = forward_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(B))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     derivs = gradient(
-        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=prep.shadows
+        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=basis_shadows
     )
     return first(derivs)
 end
@@ -160,11 +179,12 @@ function DI.value_and_gradient(
     contexts::Vararg{DI.Constant,C},
 ) where {F,SIG,B,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows, basis_shadows) = prep
     mode = forward_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(B))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     (; derivs, val) = gradient(
-        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=prep.shadows
+        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=basis_shadows
     )
     return val, first(derivs)
 end
@@ -196,10 +216,12 @@ end
 
 ## Jacobian
 
-struct EnzymeForwardOneArgJacobianPrep{SIG,B,O} <: DI.JacobianPrep{SIG}
+struct EnzymeForwardOneArgJacobianPrep{SIG,B,DF,DC,O} <: DI.JacobianPrep{SIG}
     _sig::Val{SIG}
     _valB::Val{B}
-    shadows::O
+    df::DF
+    context_shadows::DC
+    basis_shadows::O
     output_length::Int
 end
 
@@ -213,8 +235,13 @@ function DI.prepare_jacobian_nokwarg(
     _sig = DI.signature(f, backend, x, contexts...; strict)
     y = f(x, map(DI.unwrap, contexts)...)
     valB = to_val(DI.pick_batchsize(backend, x))
-    shadows = create_shadows(valB, x)
-    return EnzymeForwardOneArgJacobianPrep(_sig, valB, shadows, length(y))
+    mode = forward_withprimal(backend)
+    df = function_shadow(f, backend, valB)
+    context_shadows = shadows(backend, mode, valB, contexts...)
+    basis_shadows = create_shadows(valB, x)
+    return EnzymeForwardOneArgJacobianPrep(
+        _sig, valB, df, context_shadows, basis_shadows, length(y)
+    )
 end
 
 function DI.jacobian(
@@ -225,14 +252,15 @@ function DI.jacobian(
     contexts::Vararg{DI.Constant,C},
 ) where {F,SIG,B,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows, basis_shadows, output_length) = prep
     mode = forward_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(B))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     derivs = jacobian(
-        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=prep.shadows
+        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=basis_shadows
     )
     jac_tensor = first(derivs)
-    return maybe_reshape(jac_tensor, prep.output_length, length(x))
+    return maybe_reshape(jac_tensor, output_length, length(x))
 end
 
 function DI.value_and_jacobian(
@@ -243,14 +271,15 @@ function DI.value_and_jacobian(
     contexts::Vararg{DI.Constant,C},
 ) where {F,SIG,B,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows, basis_shadows, output_length) = prep
     mode = forward_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(B))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     (; derivs, val) = jacobian(
-        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=prep.shadows
+        mode, f_and_df, x, annotated_contexts...; chunk=Val(B), shadows=basis_shadows
     )
     jac_tensor = first(derivs)
-    return val, maybe_reshape(jac_tensor, prep.output_length, length(x))
+    return val, maybe_reshape(jac_tensor, output_length, length(x))
 end
 
 function DI.jacobian!(
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl
index 4d8328c3e..d7560f3dd 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl
@@ -1,35 +1,45 @@
 ## Pushforward
 
+struct EnzymeTwoArgPushforwardPrep{SIG,DF,DC} <: DI.PushforwardPrep{SIG}
+    _sig::Val{SIG}
+    df!::DF
+    context_shadows::DC
+end
+
 function DI.prepare_pushforward_nokwarg(
     strict::Val,
     f!::F,
     y,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
-    tx::NTuple,
+    tx::NTuple{B},
     contexts::Vararg{DI.Context,C};
-) where {F,C}
+) where {F,B,C}
     _sig = DI.signature(f!, y, backend, x, tx, contexts...; strict)
-    return DI.NoPushforwardPrep(_sig)
+    df! = function_shadow(f!, backend, Val(B))
+    mode = forward_noprimal(backend)
+    context_shadows = shadows(backend, mode, Val(B), contexts...)
+    return EnzymeTwoArgPushforwardPrep(_sig, df!, context_shadows)
 end
 
 function DI.value_and_pushforward(
     f!::F,
     y,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeTwoArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{1},
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f!, y, prep, backend, x, tx, contexts...)
+    (; df!, context_shadows) = prep
     mode = forward_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode)
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(1))
     dx = only(tx)
     dy = make_zero(y)
     x_and_dx = Duplicated(x, dx)
     y_and_dy = Duplicated(y, dy)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     autodiff(mode, f!_and_df!, Const, y_and_dy, x_and_dx, annotated_contexts...)
     return y, (dy,)
 end
@@ -37,19 +47,20 @@ end
 function DI.value_and_pushforward(
     f!::F,
     y,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeTwoArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{B},
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f!, y, prep, backend, x, tx, contexts...)
+    (; df!, context_shadows) = prep
     mode = forward_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode, Val(B))
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(B))
     ty = ntuple(_ -> make_zero(y), Val(B))
     x_and_tx = BatchDuplicated(x, tx)
     y_and_ty = BatchDuplicated(y, ty)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     autodiff(mode, f!_and_df!, Const, y_and_ty, x_and_tx, annotated_contexts...)
     return y, ty
 end
@@ -57,7 +68,7 @@ end
 function DI.pushforward(
     f!::F,
     y,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeTwoArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple,
@@ -72,18 +83,19 @@ function DI.value_and_pushforward!(
     f!::F,
     y,
     ty::NTuple{B},
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeTwoArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple{B},
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f!, y, prep, backend, x, tx, contexts...)
+    (; df!, context_shadows) = prep
     mode = forward_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode, Val(B))
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(B))
     x_and_tx = BatchDuplicated(x, tx)
     y_and_ty = BatchDuplicated(y, ty)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     autodiff(mode, f!_and_df!, Const, y_and_ty, x_and_tx, annotated_contexts...)
     return y, ty
 end
@@ -92,7 +104,7 @@ function DI.pushforward!(
     f!::F,
     y,
     ty::NTuple,
-    prep::DI.NoPushforwardPrep,
+    prep::EnzymeTwoArgPushforwardPrep,
     backend::AutoEnzyme{<:Union{ForwardMode,Nothing}},
     x,
     tx::NTuple,
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl
index b0a52fb92..b4c0aa3b6 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl
@@ -47,8 +47,10 @@ end
 
 ## Pullback
 
-struct EnzymeReverseOneArgPullbackPrep{SIG,Y} <: DI.PullbackPrep{SIG}
+struct EnzymeReverseOneArgPullbackPrep{SIG,DF,DC,Y} <: DI.PullbackPrep{SIG}
     _sig::Val{SIG}
+    df::DF
+    context_shadows::DC
     y_example::Y  # useful to create return activity
 end
 
@@ -57,12 +59,15 @@ function DI.prepare_pullback_nokwarg(
     f::F,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing}},
     x,
-    ty::NTuple,
+    ty::NTuple{B},
     contexts::Vararg{DI.Context,C};
-) where {F,C}
+) where {F,B,C}
     _sig = DI.signature(f, backend, x, ty, contexts...; strict)
+    df = function_shadow(f, backend, Val(B))
+    mode = reverse_split_withprimal(backend)
+    context_shadows = shadows(backend, mode, Val(B), contexts...)
     y = f(x, map(DI.unwrap, contexts)...)
-    return EnzymeReverseOneArgPullbackPrep(_sig, y)
+    return EnzymeReverseOneArgPullbackPrep(_sig, df, context_shadows, y)
 end
 
 ### Out-of-place
@@ -76,12 +81,13 @@ function DI.value_and_pullback(
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, ty, contexts...)
+    (; df, context_shadows, y_example) = prep
     mode = reverse_split_withprimal(backend)
-    f_and_df = force_annotation(get_f_and_df(f, backend, mode))
+    f_and_df = force_annotation(get_f_and_df_prepared!(df, f, backend, Val(1)))
     IA = guess_activity(typeof(x), mode)
-    RA = guess_activity(typeof(prep.y_example), mode)
+    RA = guess_activity(typeof(y_example), mode)
     dx = make_zero(x)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     dinputs, result = seeded_autodiff_thunk(
         mode, only(ty), f_and_df, RA, annotate(IA, x, dx), annotated_contexts...
     )
@@ -102,12 +108,13 @@ function DI.value_and_pullback(
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f, prep, backend, x, ty, contexts...)
+    (; df, context_shadows, y_example) = prep
     mode = reverse_split_withprimal(backend)
-    f_and_df = force_annotation(get_f_and_df(f, backend, mode, Val(B)))
+    f_and_df = force_annotation(get_f_and_df_prepared!(df, f, backend, Val(B)))
     IA = batchify_activity(guess_activity(typeof(x), mode), Val(B))
-    RA = batchify_activity(guess_activity(typeof(prep.y_example), mode), Val(B))
+    RA = batchify_activity(guess_activity(typeof(y_example), mode), Val(B))
     tx = ntuple(_ -> make_zero(x), Val(B))
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     dinputs, result = batch_seeded_autodiff_thunk(
         mode, ty, f_and_df, RA, annotate(IA, x, tx), annotated_contexts...
     )
@@ -143,12 +150,13 @@ function DI.value_and_pullback!(
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, ty, contexts...)
+    (; df, context_shadows, y_example) = prep
     mode = reverse_split_withprimal(backend)
-    f_and_df = force_annotation(get_f_and_df(f, backend, mode))
-    RA = guess_activity(typeof(prep.y_example), mode)
+    f_and_df = force_annotation(get_f_and_df_prepared!(df, f, backend, Val(1)))
+    RA = guess_activity(typeof(y_example), mode)
     dx = only(tx)
     make_zero!(dx)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     _, result = seeded_autodiff_thunk(
         mode, only(ty), f_and_df, RA, Duplicated(x, dx), annotated_contexts...
     )
@@ -165,11 +173,12 @@ function DI.value_and_pullback!(
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f, prep, backend, x, ty, contexts...)
+    (; df, context_shadows, y_example) = prep
     mode = reverse_split_withprimal(backend)
-    f_and_df = force_annotation(get_f_and_df(f, backend, mode, Val(B)))
-    RA = batchify_activity(guess_activity(typeof(prep.y_example), mode), Val(B))
+    f_and_df = force_annotation(get_f_and_df_prepared!(df, f, backend, Val(B)))
+    RA = batchify_activity(guess_activity(typeof(y_example), mode), Val(B))
     make_zero!(tx)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     _, result = batch_seeded_autodiff_thunk(
         mode, ty, f_and_df, RA, BatchDuplicated(x, tx), annotated_contexts...
     )
@@ -191,6 +200,12 @@ end
 
 ## Gradient
 
+struct EnzymeGradientPrep{SIG,DF,DC} <: DI.GradientPrep{SIG}
+    _sig::Val{SIG}
+    df::DF
+    context_shadows::DC
+end
+
 function DI.prepare_gradient_nokwarg(
     strict::Val,
     f::F,
@@ -199,37 +214,42 @@ function DI.prepare_gradient_nokwarg(
     contexts::Vararg{DI.Context,C};
 ) where {F,C}
     _sig = DI.signature(f, backend, x, contexts...; strict)
-    return DI.NoGradientPrep(_sig)
+    df = function_shadow(f, backend, Val(1))
+    mode = reverse_withprimal(backend)
+    context_shadows = shadows(backend, mode, Val(1), contexts...)
+    return EnzymeGradientPrep(_sig, df, context_shadows)
 end
 
 ### Enzyme gradient API (only constants)
 
 function DI.gradient(
     f::F,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing},<:Union{Nothing,Const}},
     x,
     contexts::Vararg{DI.Constant,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows) = prep
     mode = reverse_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     grads = gradient(mode, f_and_df, x, annotated_contexts...)
     return first(grads)
 end
 
 function DI.value_and_gradient(
     f::F,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing},<:Union{Nothing,Const}},
     x,
     contexts::Vararg{DI.Constant,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows) = prep
     mode = reverse_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     grads, result = gradient(mode, f_and_df, x, annotated_contexts...)
     return result, first(grads)
 end
@@ -237,13 +257,14 @@ end
 function DI.gradient!(
     f::F,
     grad,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing},<:Union{Nothing,Const}},
     x,
 ) where {F}
     DI.check_prep(f, prep, backend, x)
+    (; df) = prep
     mode = reverse_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
     gradient!(mode, grad, f_and_df, x)
     return grad
 end
@@ -251,13 +272,14 @@ end
 function DI.value_and_gradient!(
     f::F,
     grad,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing},<:Union{Nothing,Const}},
     x,
 ) where {F}
     DI.check_prep(f, prep, backend, x)
+    (; df) = prep
     mode = reverse_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
     _, result = gradient!(mode, grad, f_and_df, x)
     return result, grad
 end
@@ -266,17 +288,18 @@ end
 
 function DI.gradient(
     f::F,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing}},
     x,
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows) = prep
     mode = reverse_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
     IA = guess_activity(typeof(x), mode)
     grad = make_zero(x)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     dinputs = only(
         autodiff(mode, f_and_df, Active, annotate(IA, x, grad), annotated_contexts...)
     )
@@ -290,17 +313,18 @@ end
 
 function DI.value_and_gradient(
     f::F,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing}},
     x,
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows) = prep
     mode = reverse_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
     IA = guess_activity(typeof(x), mode)
     grad = make_zero(x)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     dinputs, result = autodiff(
         mode, f_and_df, Active, annotate(IA, x, grad), annotated_contexts...
     )
@@ -315,15 +339,16 @@ end
 function DI.gradient!(
     f::F,
     grad,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing}},
     x,
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows) = prep
     mode = reverse_noprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     make_zero!(grad)
     autodiff(mode, f_and_df, Active, Duplicated(x, grad), annotated_contexts...)
     return grad
@@ -332,15 +357,16 @@ end
 function DI.value_and_gradient!(
     f::F,
     grad,
-    prep::DI.NoGradientPrep,
+    prep::EnzymeGradientPrep,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing}},
     x,
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f, prep, backend, x, contexts...)
+    (; df, context_shadows) = prep
     mode = reverse_withprimal(backend)
-    f_and_df = get_f_and_df(f, backend, mode)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    f_and_df = get_f_and_df_prepared!(df, f, backend, Val(1))
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     make_zero!(grad)
     _, y = autodiff(mode, f_and_df, Active, Duplicated(x, grad), annotated_contexts...)
     return y, grad
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl
index ae2b33923..2dfa98acb 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl
@@ -1,7 +1,9 @@
 ## Pullback
 
-struct EnzymeReverseTwoArgPullbackPrep{SIG,TY} <: DI.PullbackPrep{SIG}
+struct EnzymeReverseTwoArgPullbackPrep{SIG,DF,DC,TY} <: DI.PullbackPrep{SIG}
     _sig::Val{SIG}
+    df!::DF
+    context_shadows::DC
     ty_copy::TY
 end
 
@@ -11,12 +13,15 @@ function DI.prepare_pullback_nokwarg(
     y,
     backend::AutoEnzyme{<:Union{ReverseMode,Nothing}},
     x,
-    ty::NTuple,
+    ty::NTuple{B},
     contexts::Vararg{DI.Context,C};
-) where {F,C}
+) where {F,B,C}
     _sig = DI.signature(f!, y, backend, x, ty, contexts...; strict)
+    df! = function_shadow(f!, backend, Val(B))
+    mode = reverse_noprimal(backend)
+    context_shadows = shadows(backend, mode, Val(B), contexts...)
     ty_copy = map(copy, ty)
-    return EnzymeReverseTwoArgPullbackPrep(_sig, ty_copy)
+    return EnzymeReverseTwoArgPullbackPrep(_sig, df!, context_shadows, ty_copy)
 end
 
 function DI.value_and_pullback(
@@ -29,12 +34,13 @@ function DI.value_and_pullback(
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f!, y, prep, backend, x, ty, contexts...)
-    copyto!(only(prep.ty_copy), only(ty))
+    (; df!, context_shadows, ty_copy) = prep
+    copyto!(only(ty_copy), only(ty))
     mode = reverse_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode)
-    dy = only(prep.ty_copy)
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(1))
+    dy = only(ty_copy)
     y_and_dy = Duplicated(y, dy)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     dinputs = only(
         autodiff(mode, f!_and_df!, Const, y_and_dy, Active(x), annotated_contexts...)
     )
@@ -52,12 +58,13 @@ function DI.value_and_pullback(
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f!, y, prep, backend, x, ty, contexts...)
-    foreach(copyto!, prep.ty_copy, ty)
+    (; df!, context_shadows, ty_copy) = prep
+    foreach(copyto!, ty_copy, ty)
     mode = reverse_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode, Val(B))
-    ty = prep.ty_copy
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(B))
+    ty = ty_copy
     y_and_ty = BatchDuplicated(y, ty)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     dinputs = only(
         autodiff(mode, f!_and_df!, Const, y_and_ty, Active(x), annotated_contexts...)
     )
@@ -75,14 +82,15 @@ function DI.value_and_pullback(
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f!, y, prep, backend, x, ty, contexts...)
-    copyto!(only(prep.ty_copy), only(ty))
+    (; df!, context_shadows, ty_copy) = prep
+    copyto!(only(ty_copy), only(ty))
     mode = reverse_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode)
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(1))
     dx = make_zero(x)  # allocates
-    dy = only(prep.ty_copy)
+    dy = only(ty_copy)
     x_and_dx = Duplicated(x, dx)
     y_and_dy = Duplicated(y, dy)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     autodiff(mode, f!_and_df!, Const, y_and_dy, x_and_dx, annotated_contexts...)
     return y, (dx,)
 end
@@ -97,14 +105,15 @@ function DI.value_and_pullback(
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f!, y, prep, backend, x, ty, contexts...)
-    foreach(copyto!, prep.ty_copy, ty)
+    (; df!, context_shadows, ty_copy) = prep
+    foreach(copyto!, ty_copy, ty)
     mode = reverse_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode, Val(B))
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(B))
     tx = ntuple(_ -> make_zero(x), Val(B))  # allocates
-    ty = prep.ty_copy
+    ty = ty_copy
     x_and_tx = BatchDuplicated(x, tx)
     y_and_ty = BatchDuplicated(y, ty)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     autodiff(mode, f!_and_df!, Const, y_and_ty, x_and_tx, annotated_contexts...)
     return y, tx
 end
@@ -120,15 +129,16 @@ function DI.value_and_pullback!(
     contexts::Vararg{DI.Context,C},
 ) where {F,C}
     DI.check_prep(f!, y, prep, backend, x, ty, contexts...)
-    copyto!(only(prep.ty_copy), only(ty))
+    (; df!, context_shadows, ty_copy) = prep
+    copyto!(only(ty_copy), only(ty))
     mode = reverse_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode)
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(1))
     dx = only(tx)
     make_zero!(dx)
-    dy = only(prep.ty_copy)
+    dy = only(ty_copy)
     x_and_dx = Duplicated(x, dx)
     y_and_dy = Duplicated(y, dy)
-    annotated_contexts = translate(backend, mode, Val(1), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(1))
     autodiff(mode, f!_and_df!, Const, y_and_dy, x_and_dx, annotated_contexts...)
     return y, (dx,)
 end
@@ -144,14 +154,15 @@ function DI.value_and_pullback!(
     contexts::Vararg{DI.Context,C},
 ) where {F,B,C}
     DI.check_prep(f!, y, prep, backend, x, ty, contexts...)
-    foreach(copyto!, prep.ty_copy, ty)
+    (; df!, context_shadows, ty_copy) = prep
+    foreach(copyto!, ty_copy, ty)
     mode = reverse_noprimal(backend)
-    f!_and_df! = get_f_and_df(f!, backend, mode, Val(B))
+    f!_and_df! = get_f_and_df_prepared!(df!, f!, backend, Val(B))
     make_zero!(tx)
-    ty = prep.ty_copy
+    ty = ty_copy
     x_and_tx = BatchDuplicated(x, tx)
     y_and_ty = BatchDuplicated(y, ty)
-    annotated_contexts = translate(backend, mode, Val(B), contexts...)
+    annotated_contexts = translate_prepared!(context_shadows, contexts, Val(B))
     autodiff(mode, f!_and_df!, Const, y_and_ty, x_and_tx, annotated_contexts...)
     return y, tx
 end
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
index 64a27e2e5..b0f8fba35 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
@@ -8,84 +8,193 @@ to_val(::DI.BatchSizeSettings{B}) where {B} = Val(B)
 
 ## Annotations
 
-@inline function get_f_and_df(
-    f::F, backend::AutoEnzyme{M,Nothing}, mode::Mode, ::Val{B}=Val(1)
-) where {F,M,B}
+const AnyDuplicated = Union{
+    Duplicated,
+    MixedDuplicated,
+    BatchDuplicated,
+    BatchMixedDuplicated,
+    DuplicatedNoNeed,
+    BatchDuplicatedNoNeed,
+}
+
+function get_f_and_df(f::F, ::AutoEnzyme{M,Nothing}, ::Val{B}) where {F,M,B}
     return f
 end
 
-@inline function get_f_and_df(
-    f::F, backend::AutoEnzyme{M,<:Const}, mode::Mode, ::Val{B}=Val(1)
-) where {F,M,B}
+function get_f_and_df(f::F, ::AutoEnzyme{M,<:Const}, ::Val{B}) where {F,M,B}
     return Const(f)
 end
 
-@inline function get_f_and_df(
-    f::F,
-    backend::AutoEnzyme{
-        M,
-        <:Union{
-            Duplicated,
-            MixedDuplicated,
-            BatchDuplicated,
-            BatchMixedDuplicated,
-            DuplicatedNoNeed,
-            BatchDuplicatedNoNeed,
-        },
-    },
-    mode::Mode,
-    ::Val{B}=Val(1),
-) where {F,M,B}
+function get_f_and_df(f::F, backend::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}) where {F,M,B}
     # TODO: needs more sophistication for mixed activities
+    df = function_shadow(f, backend, Val(B))
+    if B == 1
+        return Duplicated(f, df)
+    else
+        return BatchDuplicated(f, df)
+    end
+end
+
+function function_shadow(
+    ::F, ::AutoEnzyme{M,<:Union{Const,Nothing}}, ::Val{B}
+) where {M,B,F}
+    return nothing
+end
+
+function function_shadow(f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}) where {F,M,B}
+    if B == 1
+        return make_zero(f)
+    else
+        return ntuple(_ -> make_zero(f), Val(B))
+    end
+end
+
+function get_f_and_df_prepared!(_df, f::F, ::AutoEnzyme{M,Nothing}, ::Val{B}) where {F,M,B}
+    return f
+end
+
+function get_f_and_df_prepared!(_df, f::F, ::AutoEnzyme{M,<:Const}, ::Val{B}) where {F,M,B}
+    return Const(f)
+end
+
+function get_f_and_df_prepared!(
+    df, f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}
+) where {F,M,B}
+    make_zero!(df)
     if B == 1
-        return Duplicated(f, make_zero(f))
+        return Duplicated(f, df)
     else
-        return BatchDuplicated(f, ntuple(_ -> make_zero(f), Val(B)))
+        return BatchDuplicated(f, df)
     end
 end
 
 force_annotation(f::F) where {F<:Annotation} = f
 force_annotation(f::F) where {F} = Const(f)
 
-@inline function _translate(
-    backend::AutoEnzyme, ::Mode, ::Val{B}, c::DI.GeneralizedConstant
+function _translate(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Constant) where {B}
+    c = DI.unwrap(c_wrapped)
+    return Const(c)
+end
+
+function _translate(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Cache) where {B}
+    c = DI.unwrap(c_wrapped)
+    if B == 1
+        dc = make_zero(c)
+        return Duplicated(c, dc)
+    else
+        dc = ntuple(_ -> make_zero(c), Val(B))
+        return BatchDuplicated(c, dc)
+    end
+end
+
+function _translate(
+    backend::AutoEnzyme, mode::Mode, ::Val{B}, c_wrapped::DI.ConstantOrCache
 ) where {B}
-    return Const(DI.unwrap(c))
+    c = DI.unwrap(c_wrapped)
+    IA = guess_activity(typeof(c), mode)
+    if IA <: Const
+        return _translate(backend, mode, Val(B), DI.Constant(c))
+    else
+        return _translate(backend, mode, Val(B), DI.Cache(c))
+    end
 end
 
-@inline function _translate(
-    backend::AutoEnzyme, mode::Mode, ::Val{B}, c::DI.Cache
+function _translate(
+    backend::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.FunctionContext
 ) where {B}
-    # important to keep make_zero here for ConstantOrCache instead of similar
+    f = DI.unwrap(c_wrapped)
+    return force_annotation(get_f_and_df(f, backend, Val(B)))
+end
+
+function translate(
+    backend::AutoEnzyme, mode::Mode, ::Val{B}, contexts::Vararg{DI.Context,C}
+) where {B,C}
+    new_contexts = map(contexts) do c_wrapped
+        _translate(backend, mode, Val(B), c_wrapped)
+    end
+    return new_contexts
+end
+
+function _shadow(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Constant) where {B}
+    return nothing
+end
+
+function _shadow(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Cache) where {B}
+    c = DI.unwrap(c_wrapped)
     if B == 1
-        return Duplicated(DI.unwrap(c), make_zero(DI.unwrap(c)))
+        return make_zero(c)
     else
-        return BatchDuplicated(DI.unwrap(c), ntuple(_ -> make_zero(DI.unwrap(c)), Val(B)))
+        return ntuple(_ -> make_zero(c), Val(B))
     end
 end
 
-@inline function _translate(
-    backend::AutoEnzyme, mode::Mode, valB::Val{B}, c::DI.ConstantOrCache
+function _shadow(
+    backend::AutoEnzyme, mode::Mode, valB::Val{B}, c_wrapped::DI.ConstantOrCache
 ) where {B}
-    IA = guess_activity(typeof(DI.unwrap(c)), mode)
+    c = DI.unwrap(c_wrapped)
+    IA = guess_activity(typeof(c), mode)
     if IA <: Const
-        return _translate(backend, mode, valB, DI.Constant(DI.unwrap(c)))
+        return _shadow(backend, mode, valB, DI.Constant(c))
     else
-        return _translate(backend, mode, valB, DI.Cache(DI.unwrap(c)))
+        return _shadow(backend, mode, valB, DI.Cache(c))
     end
 end
 
-@inline function _translate(
-    backend::AutoEnzyme, mode::Mode, ::Val{B}, c::DI.FunctionContext
-) where {B}
-    return force_annotation(get_f_and_df(DI.unwrap(c), backend, mode, Val(B)))
+function _shadow(
+    backend::AutoEnzyme{M,<:Union{Const,Nothing}},
+    ::Mode,
+    ::Val{B},
+    c_wrapped::DI.FunctionContext,
+) where {M,B}
+    f = DI.unwrap(c_wrapped)
+    return function_shadow(f, backend, Val(B))
 end
 
-@inline function translate(
+function shadows(
     backend::AutoEnzyme, mode::Mode, ::Val{B}, contexts::Vararg{DI.Context,C}
 ) where {B,C}
-    new_contexts = map(contexts) do c
-        _translate(backend, mode, Val(B), c)
+    context_shadows = map(contexts) do c_wrapped
+        _shadow(backend, mode, Val(B), c_wrapped)
+    end
+    return context_shadows
+end
+
+function _translate_prepared!(dc, c_wrapped::DI.Constant, ::Val{B}) where {B}
+    c = DI.unwrap(c_wrapped)
+    return Const(c)
+end
+
+function _translate_prepared!(dc, c_wrapped::DI.Cache, ::Val{B}) where {B}
+    c = DI.unwrap(c_wrapped)
+    make_zero!(dc)
+    if B == 1
+        return Duplicated(c, dc)
+    else
+        return BatchDuplicated(c, dc)
+    end
+end
+
+function _translate_prepared!(
+    dc, c_wrapped::Union{DI.ConstantOrCache,DI.FunctionContext}, ::Val{B}
+) where {B}
+    c = DI.unwrap(c_wrapped)
+    if isnothing(dc)
+        return Constant(c)
+    else
+        make_zero!(dc)
+        if B == 1
+            return Duplicated(c, dc)
+        else
+            return BatchDuplicated(c, dc)
+        end
+    end
+end
+
+function translate_prepared!(
+    context_shadows::NTuple{C,Any}, contexts::NTuple{C,DI.Context}, ::Val{B}
+) where {B,C}
+    new_contexts = map(context_shadows, contexts) do dc, c_wrapped
+        _translate_prepared!(dc, c_wrapped, Val(B))
     end
     return new_contexts
 end
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl b/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl
index 2bb96701a..7afaa958b 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl
@@ -5,10 +5,10 @@ import DifferentiationInterface as DI
 using SparseConnectivityTracer:
     TracerSparsityDetector, TracerLocalSparsityDetector, jacobian_buffer, hessian_buffer
 
-@inline function _translate(::Type, c::Union{DI.GeneralizedConstant,DI.ConstantOrCache})
+function _translate(::Type, c::Union{DI.GeneralizedConstant,DI.ConstantOrCache})
     return DI.unwrap(c)
 end
-@inline function _translate(::Type{T}, c::DI.Cache) where {T}
+function _translate(::Type{T}, c::DI.Cache) where {T}
     return DI.recursive_similar(DI.unwrap(c), T)
 end
 

From c9156b201aa7ff71d9c0b53dae5a8cb51d67d71f Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 11:24:22 +0200
Subject: [PATCH 2/8] Fix

---
 .../ext/DifferentiationInterfaceEnzymeExt/utils.jl        | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
index b0f8fba35..26a8d907b 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
@@ -175,16 +175,18 @@ function _translate_prepared!(dc, c_wrapped::DI.Cache, ::Val{B}) where {B}
 end
 
 function _translate_prepared!(
-    dc, c_wrapped::Union{DI.ConstantOrCache,DI.FunctionContext}, ::Val{B}
+    _dc, c_wrapped::Union{DI.ConstantOrCache,DI.FunctionContext}, ::Val{B}
 ) where {B}
     c = DI.unwrap(c_wrapped)
     if isnothing(dc)
-        return Constant(c)
+        return Const(c)
     else
-        make_zero!(dc)
+        # make_zero!(dc)  # doesn't work because of immutable values
         if B == 1
+            dc = make_zero(c)
             return Duplicated(c, dc)
         else
+            dc = ntuple(_ -> make_zero(c), Val(B))
             return BatchDuplicated(c, dc)
         end
     end

From bcfb32d23907c4b2076c9550ec1d3a7d3798a6ea Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 11:48:24 +0200
Subject: [PATCH 3/8] Fix

---
 .../ext/DifferentiationInterfaceEnzymeExt/utils.jl     | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
index 26a8d907b..350dd2933 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
@@ -175,7 +175,7 @@ function _translate_prepared!(dc, c_wrapped::DI.Cache, ::Val{B}) where {B}
 end
 
 function _translate_prepared!(
-    _dc, c_wrapped::Union{DI.ConstantOrCache,DI.FunctionContext}, ::Val{B}
+    dc, c_wrapped::Union{DI.ConstantOrCache,DI.FunctionContext}, ::Val{B}
 ) where {B}
     c = DI.unwrap(c_wrapped)
     if isnothing(dc)
@@ -183,11 +183,11 @@ function _translate_prepared!(
     else
         # make_zero!(dc)  # doesn't work because of immutable values
         if B == 1
-            dc = make_zero(c)
-            return Duplicated(c, dc)
+            dc_new = make_zero(c)
+            return Duplicated(c, dc_new)
         else
-            dc = ntuple(_ -> make_zero(c), Val(B))
-            return BatchDuplicated(c, dc)
+            dc_new = ntuple(_ -> make_zero(c), Val(B))
+            return BatchDuplicated(c, dc_new)
         end
     end
 end

From 9e6dcb040d677a8bc580d52de4aec1a480ca4a87 Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 13:30:42 +0200
Subject: [PATCH 4/8] Fixes

---
 .../utils.jl                                  |  9 ++---
 .../onearg.jl                                 |  4 +--
 .../twoarg.jl                                 |  4 +--
 .../test/Back/Enzyme/test.jl                  |  5 ++-
 ...ntiationInterfaceTestComponentArraysExt.jl |  2 +-
 .../src/scenarios/modify.jl                   |  4 +--
 .../src/tests/correctness_eval.jl             | 36 +++++++------------
 7 files changed, 25 insertions(+), 39 deletions(-)

diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
index 350dd2933..87071012d 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
@@ -60,7 +60,6 @@ end
 function get_f_and_df_prepared!(
     df, f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}
 ) where {F,M,B}
-    make_zero!(df)
     if B == 1
         return Duplicated(f, df)
     else
@@ -166,7 +165,6 @@ end
 
 function _translate_prepared!(dc, c_wrapped::DI.Cache, ::Val{B}) where {B}
     c = DI.unwrap(c_wrapped)
-    make_zero!(dc)
     if B == 1
         return Duplicated(c, dc)
     else
@@ -181,13 +179,10 @@ function _translate_prepared!(
     if isnothing(dc)
         return Const(c)
     else
-        # make_zero!(dc)  # doesn't work because of immutable values
         if B == 1
-            dc_new = make_zero(c)
-            return Duplicated(c, dc_new)
+            return Duplicated(c, dc)
         else
-            dc_new = ntuple(_ -> make_zero(c), Val(B))
-            return BatchDuplicated(c, dc_new)
+            return BatchDuplicated(c, dc)
         end
     end
 end
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/onearg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/onearg.jl
index d20b31f4b..58baad569 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/onearg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/onearg.jl
@@ -46,7 +46,7 @@ function DI.pushforward(
     DI.check_prep(f, prep, backend, x, tx, contexts...)
     fc = DI.fix_tail(f, map(DI.unwrap, contexts)...)
     ty = map(tx) do dx
-        foreach((t, xi, dxi) -> (t[0] = xi; t[1] = dxi), prep.xt, x, dx)
+        foreach((t, xi, dxi) -> (t[0]=xi; t[1]=dxi), prep.xt, x, dx)
         yt = fc(prep.xt)
         if yt isa Number
             return yt[1]
@@ -71,7 +71,7 @@ function DI.pushforward!(
     fc = DI.fix_tail(f, map(DI.unwrap, contexts)...)
     for b in eachindex(tx, ty)
         dx, dy = tx[b], ty[b]
-        foreach((t, xi, dxi) -> (t[0] = xi; t[1] = dxi), prep.xt, x, dx)
+        foreach((t, xi, dxi) -> (t[0]=xi; t[1]=dxi), prep.xt, x, dx)
         yt = fc(prep.xt)
         map!(t -> t[1], dy, yt)
     end
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/twoarg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/twoarg.jl
index 9f8cc3cf3..5edbdf9e5 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/twoarg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceGTPSAExt/twoarg.jl
@@ -56,7 +56,7 @@ function DI.pushforward(
     DI.check_prep(f!, y, prep, backend, x, tx, contexts...)
     fc! = DI.fix_tail(f!, map(DI.unwrap, contexts)...)
     ty = map(tx) do dx
-        foreach((t, xi, dxi) -> (t[0] = xi; t[1] = dxi), prep.xt, x, dx)
+        foreach((t, xi, dxi) -> (t[0]=xi; t[1]=dxi), prep.xt, x, dx)
         fc!(prep.yt, prep.xt)
         dy = map(t -> t[1], prep.yt)
         return dy
@@ -79,7 +79,7 @@ function DI.pushforward!(
     fc! = DI.fix_tail(f!, map(DI.unwrap, contexts)...)
     for b in eachindex(tx, ty)
         dx, dy = tx[b], ty[b]
-        foreach((t, xi, dxi) -> (t[0] = xi; t[1] = dxi), prep.xt, x, dx)
+        foreach((t, xi, dxi) -> (t[0]=xi; t[1]=dxi), prep.xt, x, dx)
         fc!(prep.yt, prep.xt)
         map!(t -> t[1], dy, prep.yt)
     end
diff --git a/DifferentiationInterface/test/Back/Enzyme/test.jl b/DifferentiationInterface/test/Back/Enzyme/test.jl
index a772e48ba..d8764aebf 100644
--- a/DifferentiationInterface/test/Back/Enzyme/test.jl
+++ b/DifferentiationInterface/test/Back/Enzyme/test.jl
@@ -68,7 +68,10 @@ end;
 
     test_differentiation(
         duplicated_backends,
-        default_scenarios(; include_normal=false, include_closurified=true);
+        filter(
+            s -> !(s.y isa Matrix),  # TODO: remove
+            default_scenarios(; include_normal=false, include_closurified=true),
+        );
         excluded=SECOND_ORDER,
         logging=LOGGING,
     )
diff --git a/DifferentiationInterfaceTest/ext/DifferentiationInterfaceTestComponentArraysExt/DifferentiationInterfaceTestComponentArraysExt.jl b/DifferentiationInterfaceTest/ext/DifferentiationInterfaceTestComponentArraysExt/DifferentiationInterfaceTestComponentArraysExt.jl
index 14aca7d77..ec5e37b27 100644
--- a/DifferentiationInterfaceTest/ext/DifferentiationInterfaceTestComponentArraysExt/DifferentiationInterfaceTestComponentArraysExt.jl
+++ b/DifferentiationInterfaceTest/ext/DifferentiationInterfaceTestComponentArraysExt/DifferentiationInterfaceTestComponentArraysExt.jl
@@ -10,7 +10,7 @@ function comp_to_num(x::ComponentVector)::Number
     return sum(sin.(x.a)) + sum(cos.(x.b))
 end
 
-comp_to_num_gradient(x) = ComponentVector(; a=cos.(x.a), b=-sin.(x.b))
+comp_to_num_gradient(x) = ComponentVector(; a=cos.(x.a), b=(-sin.(x.b)))
 
 function comp_to_num_pushforward(x, dx)
     g = comp_to_num_gradient(x)
diff --git a/DifferentiationInterfaceTest/src/scenarios/modify.jl b/DifferentiationInterfaceTest/src/scenarios/modify.jl
index cd1fb521e..800b84243 100644
--- a/DifferentiationInterfaceTest/src/scenarios/modify.jl
+++ b/DifferentiationInterfaceTest/src/scenarios/modify.jl
@@ -115,13 +115,13 @@ end
 Base.show(io::IO, f::WritableClosure) = print(io, "WritableClosure($(f.f))")
 
 function (mc::WritableClosure{:out})(x)
-    mc.x_buffer[1] = x
+    mc.x_buffer[1] = copy(x)
     mc.y_buffer[1] = mc.f(x)
     return copy(mc.y_buffer[1])
 end
 
 function (mc::WritableClosure{:in})(y, x)
-    mc.x_buffer[1] = x
+    mc.x_buffer[1] = copy(x)
     mc.f(mc.y_buffer[1], mc.x_buffer[1])
     copyto!(y, mc.y_buffer[1])
     return nothing
diff --git a/DifferentiationInterfaceTest/src/tests/correctness_eval.jl b/DifferentiationInterfaceTest/src/tests/correctness_eval.jl
index 9ed8013f8..8b183059e 100644
--- a/DifferentiationInterfaceTest/src/tests/correctness_eval.jl
+++ b/DifferentiationInterfaceTest/src/tests/correctness_eval.jl
@@ -56,8 +56,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -124,8 +123,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -208,8 +206,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -286,8 +283,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -375,8 +371,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -445,8 +440,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -532,8 +526,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -599,8 +592,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -682,8 +674,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -765,8 +756,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -867,8 +857,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)
@@ -934,8 +923,7 @@ for op in ALL_OPS
             contextsrand = rewrap(map(myrandom ∘ unwrap, contexts)...)
             local prepstrict
             preptup_cands_val, preptup_cands_noval = map(1:2) do _
-                new_smaller =
-                    if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
+                new_smaller = if isnothing(smaller) || adapt_batchsize(ba, smaller) != ba
                         deepcopy(scen)
                     else
                         deepcopy(smaller)

From a7e4de6298ad82f21a26bbde3fd99f2d6a19e2b4 Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 14:11:21 +0200
Subject: [PATCH 5/8] Allow JuliaFormatter 2

---
 DifferentiationInterface/Project.toml     | 2 +-
 DifferentiationInterfaceTest/Project.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DifferentiationInterface/Project.toml b/DifferentiationInterface/Project.toml
index 1a1916f3a..852718e42 100644
--- a/DifferentiationInterface/Project.toml
+++ b/DifferentiationInterface/Project.toml
@@ -65,7 +65,7 @@ FiniteDifferences = "0.12.31"
 ForwardDiff = "0.10.36,1"
 GPUArraysCore = "0.2"
 GTPSA = "1.4.0"
-JuliaFormatter = "1"
+JuliaFormatter = "1,2"
 LinearAlgebra = "<0.0.1,1"
 Mooncake = "0.4.88"
 PolyesterForwardDiff = "0.1.2"
diff --git a/DifferentiationInterfaceTest/Project.toml b/DifferentiationInterfaceTest/Project.toml
index a1aac1eca..26b7ccd4c 100644
--- a/DifferentiationInterfaceTest/Project.toml
+++ b/DifferentiationInterfaceTest/Project.toml
@@ -52,7 +52,7 @@ ForwardDiff = "0.10.36,1"
 Functors = "0.4, 0.5"
 JET = "0.4 - 0.8, 0.9"
 JLArrays = "0.1, 0.2"
-JuliaFormatter = "1"
+JuliaFormatter = "1,2"
 LinearAlgebra = "<0.0.1,1"
 Lux = "1.1.0"
 LuxTestUtils = "1.3.1"

From 592e299e9b0c765689bd253b7bef12d53a9f7064 Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 15:02:42 +0200
Subject: [PATCH 6/8] Code coverage

---
 .../forward_onearg.jl                         |   6 +-
 .../forward_twoarg.jl                         |   2 +-
 .../reverse_onearg.jl                         |   4 +-
 .../reverse_twoarg.jl                         |   2 +-
 .../utils.jl                                  | 102 ++++--------------
 ...ionInterfaceSparseConnectivityTracerExt.jl |   4 +-
 6 files changed, 31 insertions(+), 89 deletions(-)

diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl
index 4b0c855b2..3dd70bbc7 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_onearg.jl
@@ -17,7 +17,7 @@ function DI.prepare_pushforward_nokwarg(
     _sig = DI.signature(f, backend, x, tx, contexts...; strict)
     df = function_shadow(f, backend, Val(B))
     mode = forward_withprimal(backend)
-    context_shadows = shadows(backend, mode, Val(B), contexts...)
+    context_shadows = make_context_shadows(backend, mode, Val(B), contexts...)
     return EnzymeOneArgPushforwardPrep(_sig, df, context_shadows)
 end
 
@@ -148,7 +148,7 @@ function DI.prepare_gradient_nokwarg(
     valB = to_val(DI.pick_batchsize(backend, x))
     df = function_shadow(f, backend, valB)
     mode = forward_withprimal(backend)
-    context_shadows = shadows(backend, mode, valB, contexts...)
+    context_shadows = make_context_shadows(backend, mode, valB, contexts...)
     basis_shadows = create_shadows(valB, x)
     return EnzymeForwardGradientPrep(_sig, valB, df, context_shadows, basis_shadows)
 end
@@ -237,7 +237,7 @@ function DI.prepare_jacobian_nokwarg(
     valB = to_val(DI.pick_batchsize(backend, x))
     mode = forward_withprimal(backend)
     df = function_shadow(f, backend, valB)
-    context_shadows = shadows(backend, mode, valB, contexts...)
+    context_shadows = make_context_shadows(backend, mode, valB, contexts...)
     basis_shadows = create_shadows(valB, x)
     return EnzymeForwardOneArgJacobianPrep(
         _sig, valB, df, context_shadows, basis_shadows, length(y)
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl
index d7560f3dd..f0d2a2d91 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/forward_twoarg.jl
@@ -18,7 +18,7 @@ function DI.prepare_pushforward_nokwarg(
     _sig = DI.signature(f!, y, backend, x, tx, contexts...; strict)
     df! = function_shadow(f!, backend, Val(B))
     mode = forward_noprimal(backend)
-    context_shadows = shadows(backend, mode, Val(B), contexts...)
+    context_shadows = make_context_shadows(backend, mode, Val(B), contexts...)
     return EnzymeTwoArgPushforwardPrep(_sig, df!, context_shadows)
 end
 
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl
index b4c0aa3b6..67b3989f0 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_onearg.jl
@@ -65,7 +65,7 @@ function DI.prepare_pullback_nokwarg(
     _sig = DI.signature(f, backend, x, ty, contexts...; strict)
     df = function_shadow(f, backend, Val(B))
     mode = reverse_split_withprimal(backend)
-    context_shadows = shadows(backend, mode, Val(B), contexts...)
+    context_shadows = make_context_shadows(backend, mode, Val(B), contexts...)
     y = f(x, map(DI.unwrap, contexts)...)
     return EnzymeReverseOneArgPullbackPrep(_sig, df, context_shadows, y)
 end
@@ -216,7 +216,7 @@ function DI.prepare_gradient_nokwarg(
     _sig = DI.signature(f, backend, x, contexts...; strict)
     df = function_shadow(f, backend, Val(1))
     mode = reverse_withprimal(backend)
-    context_shadows = shadows(backend, mode, Val(1), contexts...)
+    context_shadows = make_context_shadows(backend, mode, Val(1), contexts...)
     return EnzymeGradientPrep(_sig, df, context_shadows)
 end
 
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl
index 2dfa98acb..18c4d9c68 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/reverse_twoarg.jl
@@ -19,7 +19,7 @@ function DI.prepare_pullback_nokwarg(
     _sig = DI.signature(f!, y, backend, x, ty, contexts...; strict)
     df! = function_shadow(f!, backend, Val(B))
     mode = reverse_noprimal(backend)
-    context_shadows = shadows(backend, mode, Val(B), contexts...)
+    context_shadows = make_context_shadows(backend, mode, Val(B), contexts...)
     ty_copy = map(copy, ty)
     return EnzymeReverseTwoArgPullbackPrep(_sig, df!, context_shadows, ty_copy)
 end
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
index 87071012d..16356ff4a 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
@@ -1,3 +1,12 @@
+const AnyDuplicated = Union{
+    Duplicated,
+    MixedDuplicated,
+    BatchDuplicated,
+    BatchMixedDuplicated,
+    DuplicatedNoNeed,
+    BatchDuplicatedNoNeed,
+}
+
 # until https://github.com/EnzymeAD/Enzyme.jl/pull/1545 is merged
 function DI.pick_batchsize(::AutoEnzyme, N::Integer)
     B = DI.reasonable_batchsize(N, 16)
@@ -8,26 +17,17 @@ to_val(::DI.BatchSizeSettings{B}) where {B} = Val(B)
 
 ## Annotations
 
-const AnyDuplicated = Union{
-    Duplicated,
-    MixedDuplicated,
-    BatchDuplicated,
-    BatchMixedDuplicated,
-    DuplicatedNoNeed,
-    BatchDuplicatedNoNeed,
-}
-
-function get_f_and_df(f::F, ::AutoEnzyme{M,Nothing}, ::Val{B}) where {F,M,B}
+function get_f_and_df_prepared!(_df, f::F, ::AutoEnzyme{M,Nothing}, ::Val{B}) where {F,M,B}
     return f
 end
 
-function get_f_and_df(f::F, ::AutoEnzyme{M,<:Const}, ::Val{B}) where {F,M,B}
+function get_f_and_df_prepared!(_df, f::F, ::AutoEnzyme{M,<:Const}, ::Val{B}) where {F,M,B}
     return Const(f)
 end
 
-function get_f_and_df(f::F, backend::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}) where {F,M,B}
-    # TODO: needs more sophistication for mixed activities
-    df = function_shadow(f, backend, Val(B))
+function get_f_and_df_prepared!(
+    df, f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}
+) where {F,M,B}
     if B == 1
         return Duplicated(f, df)
     else
@@ -49,71 +49,9 @@ function function_shadow(f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}) where
     end
 end
 
-function get_f_and_df_prepared!(_df, f::F, ::AutoEnzyme{M,Nothing}, ::Val{B}) where {F,M,B}
-    return f
-end
-
-function get_f_and_df_prepared!(_df, f::F, ::AutoEnzyme{M,<:Const}, ::Val{B}) where {F,M,B}
-    return Const(f)
-end
-
-function get_f_and_df_prepared!(
-    df, f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}
-) where {F,M,B}
-    if B == 1
-        return Duplicated(f, df)
-    else
-        return BatchDuplicated(f, df)
-    end
-end
-
 force_annotation(f::F) where {F<:Annotation} = f
 force_annotation(f::F) where {F} = Const(f)
 
-function _translate(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Constant) where {B}
-    c = DI.unwrap(c_wrapped)
-    return Const(c)
-end
-
-function _translate(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Cache) where {B}
-    c = DI.unwrap(c_wrapped)
-    if B == 1
-        dc = make_zero(c)
-        return Duplicated(c, dc)
-    else
-        dc = ntuple(_ -> make_zero(c), Val(B))
-        return BatchDuplicated(c, dc)
-    end
-end
-
-function _translate(
-    backend::AutoEnzyme, mode::Mode, ::Val{B}, c_wrapped::DI.ConstantOrCache
-) where {B}
-    c = DI.unwrap(c_wrapped)
-    IA = guess_activity(typeof(c), mode)
-    if IA <: Const
-        return _translate(backend, mode, Val(B), DI.Constant(c))
-    else
-        return _translate(backend, mode, Val(B), DI.Cache(c))
-    end
-end
-
-function _translate(
-    backend::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.FunctionContext
-) where {B}
-    f = DI.unwrap(c_wrapped)
-    return force_annotation(get_f_and_df(f, backend, Val(B)))
-end
-
-function translate(
-    backend::AutoEnzyme, mode::Mode, ::Val{B}, contexts::Vararg{DI.Context,C}
-) where {B,C}
-    new_contexts = map(contexts) do c_wrapped
-        _translate(backend, mode, Val(B), c_wrapped)
-    end
-    return new_contexts
-end
-
 function _shadow(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Constant) where {B}
     return nothing
 end
@@ -128,14 +66,18 @@ function _shadow(::AutoEnzyme, ::Mode, ::Val{B}, c_wrapped::DI.Cache) where {B}
 end
 
 function _shadow(
-    backend::AutoEnzyme, mode::Mode, valB::Val{B}, c_wrapped::DI.ConstantOrCache
+    ::AutoEnzyme, mode::Mode, valB::Val{B}, c_wrapped::DI.ConstantOrCache
 ) where {B}
     c = DI.unwrap(c_wrapped)
     IA = guess_activity(typeof(c), mode)
     if IA <: Const
-        return _shadow(backend, mode, valB, DI.Constant(c))
+        nothing
     else
-        return _shadow(backend, mode, valB, DI.Cache(c))
+        if B == 1
+            return make_zero(c)
+        else
+            return ntuple(_ -> make_zero(c), Val(B))
+        end
     end
 end
 
@@ -149,7 +91,7 @@ function _shadow(
     return function_shadow(f, backend, Val(B))
 end
 
-function shadows(
+function make_context_shadows(
     backend::AutoEnzyme, mode::Mode, ::Val{B}, contexts::Vararg{DI.Context,C}
 ) where {B,C}
     context_shadows = map(contexts) do c_wrapped
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl b/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl
index 7afaa958b..2bb96701a 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceSparseConnectivityTracerExt/DifferentiationInterfaceSparseConnectivityTracerExt.jl
@@ -5,10 +5,10 @@ import DifferentiationInterface as DI
 using SparseConnectivityTracer:
     TracerSparsityDetector, TracerLocalSparsityDetector, jacobian_buffer, hessian_buffer
 
-function _translate(::Type, c::Union{DI.GeneralizedConstant,DI.ConstantOrCache})
+@inline function _translate(::Type, c::Union{DI.GeneralizedConstant,DI.ConstantOrCache})
     return DI.unwrap(c)
 end
-function _translate(::Type{T}, c::DI.Cache) where {T}
+@inline function _translate(::Type{T}, c::DI.Cache) where {T}
     return DI.recursive_similar(DI.unwrap(c), T)
 end
 

From 2a8a1797e4bd7725f6c60c49e5a8818b547ed528 Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Tue, 6 May 2025 22:30:21 +0200
Subject: [PATCH 7/8] Re-add matrix tests

---
 DifferentiationInterface/test/Back/Enzyme/test.jl | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/DifferentiationInterface/test/Back/Enzyme/test.jl b/DifferentiationInterface/test/Back/Enzyme/test.jl
index d8764aebf..a772e48ba 100644
--- a/DifferentiationInterface/test/Back/Enzyme/test.jl
+++ b/DifferentiationInterface/test/Back/Enzyme/test.jl
@@ -68,10 +68,7 @@ end;
 
     test_differentiation(
         duplicated_backends,
-        filter(
-            s -> !(s.y isa Matrix),  # TODO: remove
-            default_scenarios(; include_normal=false, include_closurified=true),
-        );
+        default_scenarios(; include_normal=false, include_closurified=true);
         excluded=SECOND_ORDER,
         logging=LOGGING,
     )

From d6ead1b3e99f30b6d1d5b64661514d9dcb69197d Mon Sep 17 00:00:00 2001
From: Guillaume Dalle <22795598+gdalle@users.noreply.github.com>
Date: Wed, 7 May 2025 09:51:51 +0200
Subject: [PATCH 8/8] Add finer tests and comments

---
 .../utils.jl                                  | 10 ++++
 .../src/scenarios/modify.jl                   | 56 +++++++++++++------
 2 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
index 16356ff4a..991796bb1 100644
--- a/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
+++ b/DifferentiationInterface/ext/DifferentiationInterfaceEnzymeExt/utils.jl
@@ -28,6 +28,11 @@ end
 function get_f_and_df_prepared!(
     df, f::F, ::AutoEnzyme{M,<:AnyDuplicated}, ::Val{B}
 ) where {F,M,B}
+    #=
+    It is not obvious why we don't need a `make_zero` here, in the case of mutable constant data in `f`.
+    - In forward mode, `df` is never incremented if `f` is not mutated, so it remains equal to its initial value of `0`.
+    - In reverse mode, `df` gets incremented but it does not influence the input cotangent `dx`.
+    =#
     if B == 1
         return Duplicated(f, df)
     else
@@ -117,6 +122,11 @@ end
 function _translate_prepared!(
     dc, c_wrapped::Union{DI.ConstantOrCache,DI.FunctionContext}, ::Val{B}
 ) where {B}
+    #=
+    It is not obvious why we don't need a `make_zero` here, in the case of mutable constant contexts.
+    - In forward mode, `dc` is never incremented because `c` is not mutated, so it remains equal to its initial value of `0`.
+    - In reverse mode, `dc` gets incremented but it does not influence the input cotangent `dx`.
+    =#
     c = DI.unwrap(c_wrapped)
     if isnothing(dc)
         return Const(c)
diff --git a/DifferentiationInterfaceTest/src/scenarios/modify.jl b/DifferentiationInterfaceTest/src/scenarios/modify.jl
index 800b84243..ee794892d 100644
--- a/DifferentiationInterfaceTest/src/scenarios/modify.jl
+++ b/DifferentiationInterfaceTest/src/scenarios/modify.jl
@@ -104,26 +104,31 @@ struct WritableClosure{pl_fun,F,X,Y} <: FunctionModifier
     f::F
     x_buffer::Vector{X}
     y_buffer::Vector{Y}
+    a::Float64
+    b::Vector{Float64}
 end
 
 function WritableClosure{pl_fun}(
-    f::F, x_buffer::Vector{X}, y_buffer::Vector{Y}
+    f::F, x_buffer::Vector{X}, y_buffer::Vector{Y}, a, b
 ) where {pl_fun,F,X,Y}
-    return WritableClosure{pl_fun,F,X,Y}(f, x_buffer, y_buffer)
+    return WritableClosure{pl_fun,F,X,Y}(f, x_buffer, y_buffer, a, b)
 end
 
 Base.show(io::IO, f::WritableClosure) = print(io, "WritableClosure($(f.f))")
 
 function (mc::WritableClosure{:out})(x)
-    mc.x_buffer[1] = copy(x)
-    mc.y_buffer[1] = mc.f(x)
-    return copy(mc.y_buffer[1])
+    (; f, x_buffer, y_buffer, a, b) = mc
+    x_buffer[1] = copy(x)
+    y_buffer[1] = (a + only(b)) * f(x)
+    return copy(y_buffer[1])
 end
 
 function (mc::WritableClosure{:in})(y, x)
-    mc.x_buffer[1] = copy(x)
-    mc.f(mc.y_buffer[1], mc.x_buffer[1])
-    copyto!(y, mc.y_buffer[1])
+    (; f, x_buffer, y_buffer, a, b) = mc
+    x_buffer[1] = copy(x)
+    f(y_buffer[1], x_buffer[1])
+    y_buffer[1] .*= (a + only(b))
+    copyto!(y, y_buffer[1])
     return nothing
 end
 
@@ -132,13 +137,25 @@ end
 
 Return a new `Scenario` identical to `scen` except for the function `f` which is made to close over differentiable data.
 """
-function closurify(scen::Scenario)
+function closurify(scen::Scenario{op,pl_op,pl_fun}) where {op,pl_op,pl_fun}
     (; f, x, y) = scen
     @assert isempty(scen.contexts)
     x_buffer = [zero(x)]
     y_buffer = [zero(y)]
-    closure_f = WritableClosure{function_place(scen)}(f, x_buffer, y_buffer)
-    return change_function(scen, closure_f; keep_smaller=false)
+    a = 3.0
+    b = [4.0]
+    closure_f = WritableClosure{pl_fun}(f, x_buffer, y_buffer, a, b)
+    return Scenario{op,pl_op,pl_fun}(
+        closure_f;
+        x        = scen.x,
+        y        = mymultiply(scen.y, a + only(b)),
+        tang     = scen.tang,
+        contexts = scen.contexts,
+        res1     = mymultiply(scen.res1, a + only(b)),
+        res2     = mymultiply(scen.res2, a + only(b)),
+        smaller  = nothing,
+        name     = isnothing(scen.name) ? nothing : scen.name * " [closurified]",
+    )
 end
 
 struct MultiplyByConstant{pl_fun,F} <: FunctionModifier
@@ -267,7 +284,8 @@ end
 
 function (sc::MultiplyByConstantAndStoreInCache{:out})(x, constantorcache)
     (; constant, cache) = constantorcache
-    y = constant * sc.f(x)
+    (; a, b) = constant
+    y = (a + only(b)) * sc.f(x)
     if eltype(y) == eltype(cache)
         newcache = cache
     else
@@ -285,6 +303,7 @@ end
 
 function (sc::MultiplyByConstantAndStoreInCache{:in})(y, x, constantorcache)
     (; constant, cache) = constantorcache
+    (; a, b) = constant
     if eltype(y) == eltype(cache)
         newcache = cache
     else
@@ -292,7 +311,7 @@ function (sc::MultiplyByConstantAndStoreInCache{:in})(y, x, constantorcache)
         newcache = similar(cache, eltype(y))
     end
     sc.f(newcache, x)
-    newcache .*= constant
+    newcache .*= (a + only(b))
     copyto!(y, newcache)
     return nothing
 end
@@ -307,19 +326,20 @@ function constantorcachify(scen::Scenario{op,pl_op,pl_fun}) where {op,pl_op,pl_f
     @assert isempty(scen.contexts)
     constantorcache_f = MultiplyByConstantAndStoreInCache{pl_fun}(f)
     a = 3.0
+    b = [4.0]
     constantorcache = if scen.y isa Number
-        (; cache=[myzero(scen.y)], constant=a)
+        (; cache=[myzero(scen.y)], constant=(; a, b))
     else
-        (; cache=mysimilar(scen.y), constant=a)
+        (; cache=mysimilar(scen.y), constant=(; a, b))
     end
     return Scenario{op,pl_op,pl_fun}(
         constantorcache_f;
         x=scen.x,
-        y=mymultiply(scen.y, a),
+        y=mymultiply(scen.y, a + only(b)),
         tang=scen.tang,
         contexts=(ConstantOrCache(constantorcache),),
-        res1=mymultiply(scen.res1, a),
-        res2=mymultiply(scen.res2, a),
+        res1=mymultiply(scen.res1, a + only(b)),
+        res2=mymultiply(scen.res2, a + only(b)),
         smaller=isnothing(scen.smaller) ? nothing : constantorcachify(scen.smaller),
         name=isnothing(scen.name) ? nothing : scen.name * " [constantorcachified]",
     )