diff --git a/DifferentiationInterface/CHANGELOG.md b/DifferentiationInterface/CHANGELOG.md index 613d27a68..d2e87dce4 100644 --- a/DifferentiationInterface/CHANGELOG.md +++ b/DifferentiationInterface/CHANGELOG.md @@ -5,7 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterface-v0.7.11...main) +## [Unreleased](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterface-v0.7.12...main) + +## [0.7.12](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterface-v0.7.11...DifferentiationInterface-v0.7.12) + +### Added + +- Better documentation on argument assumptions ([#917](https://github.com/JuliaDiff/DifferentiationInterface.jl/pull/917)) + +### Fixed + +- Speed up Mooncake in forward mode by preallocating tangents ([#915](https://github.com/JuliaDiff/DifferentiationInterface.jl/pull/915)) +- Speed up Mooncake reverse mode with selective zeroing ([#916](https://github.com/JuliaDiff/DifferentiationInterface.jl/pull/916)) ## [0.7.11](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterface-v0.7.10...DifferentiationInterface-v0.7.11) diff --git a/DifferentiationInterface/Project.toml b/DifferentiationInterface/Project.toml index e014b3a67..ded9bd6c3 100644 --- a/DifferentiationInterface/Project.toml +++ b/DifferentiationInterface/Project.toml @@ -1,7 +1,7 @@ name = "DifferentiationInterface" uuid = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" authors = ["Guillaume Dalle", "Adrian Hill"] -version = "0.7.11" +version = "0.7.12" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -74,7 +74,7 @@ PolyesterForwardDiff = "0.1.2" ReverseDiff = "1.15.1" SparseArrays = "1" SparseConnectivityTracer = "0.6.14, 1" -SparseMatrixColorings = "0.4.9" +SparseMatrixColorings = "0.4.23" StaticArrays = "1.9.7" Symbolics = "5.27.1, 6, 7" Tracker = "0.2.33" diff --git a/DifferentiationInterface/docs/make.jl b/DifferentiationInterface/docs/make.jl index fae4d50de..334b5b83e 100644 --- a/DifferentiationInterface/docs/make.jl +++ b/DifferentiationInterface/docs/make.jl @@ -27,13 +27,14 @@ makedocs(; pages = [ "Home" => "index.md", "Tutorials" => ["tutorials/basic.md", "tutorials/advanced.md"], + "api.md", "Explanation" => [ + "explanation/arguments.md", "explanation/operators.md", "explanation/backends.md", "explanation/advanced.md", ], "FAQ" => ["faq/limitations.md", "faq/differentiability.md"], - "api.md", "Development" => [ "dev/internals.md", "dev/math.md", diff --git a/DifferentiationInterface/docs/src/explanation/advanced.md b/DifferentiationInterface/docs/src/explanation/advanced.md index 5aa699291..fc4f3c651 100644 --- a/DifferentiationInterface/docs/src/explanation/advanced.md +++ b/DifferentiationInterface/docs/src/explanation/advanced.md @@ -1,44 +1,12 @@ # Advanced features -## Contexts - -### Additional arguments - -For all operators provided DifferentiationInterface, there can be only one differentiated (or "active") argument, which we call `x`. -However, the release v0.6 introduced the possibility of additional "context" arguments, which are not differentiated but still passed to the function after `x`. 
- -Contexts can be useful if you have a function `y = f(x, a, b, c, ...)` or `f!(y, x, a, b, c, ...)` and you want derivatives of `y` with respect to `x` only. -Another option would be creating a closure, but that is sometimes undesirable. - -### Types of contexts - -Every context argument must be wrapped in a subtype of [`Context`](@ref) and come after the differentiated input `x`. -Right now, there are two kinds of context: [`Constant`](@ref) and [`Cache`](@ref). - -!!! warning - - Not every backend supports every type of context. See the documentation on [Backends](@ref) for more details. - -Semantically, both of these calls compute the partial gradient of `f(x, c)` with respect to `x`, but they consider `c` differently: - -```julia -gradient(f, backend, x, Constant(c)) -gradient(f, backend, x, Cache(c)) -``` - -In the first call, `c` is kept unchanged throughout the function evaluation. -In the second call, `c` can be mutated with values computed during the function. - -Importantly, one can prepare an operator with an arbitrary value `c'` of the `Constant` (subject to the usual restrictions on preparation). -The values in a provided `Cache` never matter anyway. - ## Sparsity When faced with sparse Jacobian or Hessian matrices, one can take advantage of their sparsity pattern to speed up the computation. DifferentiationInterface does this automatically if you pass a backend of type [`AutoSparse`](@extref ADTypes.AutoSparse). !!! tip - + To know more about sparse AD, read the survey [_What Color Is Your Jacobian? Graph Coloring for Computing Derivatives_](https://epubs.siam.org/doi/10.1137/S0036144504444711) (Gebremedhin et al., 2005). ### `AutoSparse` object @@ -48,29 +16,32 @@ An `AutoSparse` backend must be constructed from three ingredients: 1. An underlying (dense) backend, which can be [`SecondOrder`](@ref) or anything from [ADTypes.jl](https://github.com/SciML/ADTypes.jl) - 2. A sparsity pattern detector like: - + 2. A sparsity pattern detector following the [`ADTypes.AbstractSparsityDetector`](@extref ADTypes.AbstractSparsityDetector) interface, such as: + + [`TracerSparsityDetector`](@extref SparseConnectivityTracer.TracerSparsityDetector) from [SparseConnectivityTracer.jl](https://github.com/adrhill/SparseConnectivityTracer.jl) + [`SymbolicsSparsityDetector`](@extref Symbolics.SymbolicsSparsityDetector) from [Symbolics.jl](https://github.com/JuliaSymbolics/Symbolics.jl) + [`DenseSparsityDetector`](@ref) from DifferentiationInterface.jl (beware that this detector only gives a locally valid pattern) + [`KnownJacobianSparsityDetector`](@extref ADTypes.KnownJacobianSparsityDetector) or [`KnownHessianSparsityDetector`](@extref ADTypes.KnownHessianSparsityDetector) from [ADTypes.jl](https://github.com/SciML/ADTypes.jl) (if you already know the pattern) - 3. A coloring algorithm from [SparseMatrixColorings.jl](https://github.com/gdalle/SparseMatrixColorings.jl), such as: - - + [`GreedyColoringAlgorithm`](@extref SparseMatrixColorings.GreedyColoringAlgorithm) (our generic recommendation) + + 3. 
A coloring algorithm following the [`ADTypes.AbstractColoringAlgorithm`](@extref ADTypes.AbstractColoringAlgorithm) interface, such as those from [SparseMatrixColorings.jl](https://github.com/gdalle/SparseMatrixColorings.jl): + + + [`GreedyColoringAlgorithm`](@extref SparseMatrixColorings.GreedyColoringAlgorithm) (our generic recommendation, don't forget to tune the `order` parameter) + [`ConstantColoringAlgorithm`](@extref SparseMatrixColorings.ConstantColoringAlgorithm) (if you have already computed the optimal coloring and always want to return it) + + [`OptimalColoringAlgorithm`](@extref SparseMatrixColorings.OptimalColoringAlgorithm) (if you have a low-dimensional matrix for which you want to know the best possible coloring) !!! note - + Symbolic backends have built-in sparsity handling, so `AutoSparse(AutoSymbolics())` and `AutoSparse(AutoFastDifferentiation())` do not need additional configuration for pattern detection or coloring. -### Cost of sparse preparation +### Reusing sparse preparation The preparation step of `jacobian` or `hessian` with an `AutoSparse` backend can be long, because it needs to detect the sparsity pattern and perform a matrix coloring. But after preparation, the more zeros are present in the matrix, the greater the speedup will be compared to dense differentiation. !!! danger - + The result of preparation for an `AutoSparse` backend cannot be reused if the sparsity pattern changes. + In particular, during preparation, make sure to pick input and context values that do not give rise to exceptional patterns (e.g. with too many zeros because of a multiplication with a constant `c = 0`, which may then be non-zero later on). Random values are usually a better choice during sparse preparation. ### Tuning the coloring algorithm diff --git a/DifferentiationInterface/docs/src/explanation/arguments.md b/DifferentiationInterface/docs/src/explanation/arguments.md new file mode 100644 index 000000000..a4e1ddeec --- /dev/null +++ b/DifferentiationInterface/docs/src/explanation/arguments.md @@ -0,0 +1,71 @@ +# Arguments + +## General guidelines + +### Function form + +DifferentiationInterface only computes derivatives for functions with one of two specific forms: + +```julia +y = f(x, contexts...) # out of place, returns `y` +f!(y, x, contexts...) # in place, returns `nothing` +``` + +In this notation: + +- `f` (or `f!`) is the differentiated function +- `y` is the output +- `x` is the input, the only "active" argument, which always comes first +- `contexts` may contain additional, inactive arguments + +The quantities returned by the various [operators](@ref "Operators") always correspond to (partial) derivatives of `y` with respect to `x`. + +### Assumptions + +The package makes one central assumption on the behavior and implementation of `f` (or `f!`): + +!!! danger "Mutation rule" + Either an argument's provided value matters, or it can be mutated during the function call, but never both. + +This rule is declined as follows: + +- The provided value of `x` matters because we evaluate and differentiate `f` at point `x`. Therefore, `x` cannot be mutated by the function. +- For in-place functions `f!`, the output `y` is meant to be overwritten. Hence, its provided (initial) value cannot matter, and it must be entirely overwritten. + +!!! warning + Whether or not the function object itself can be mutated is a tricky question, and support for this varies between backends. + When in doubt, try to avoid mutating functions and pass contexts instead. 
+ In any case, DifferentiationInterface will assume that the recursive components (fields, subfields, etc.) of `f` or `f!` individually satisfy the same mutation rule: whenever the initial value matters, no mutation is allowed. + +## Contexts + +### Motivation + +As stated, there can be only one active argument, which we call `x`. +However, version 0.6 of the package introduced the possibility of additional "context" arguments, whose derivatives we don't need to compute. +Contexts can be useful if you have a function `y = f(x, a, b, c, ...)` or `f!(y, x, a, b, c, ...)` and you only want the derivative of `y` with respect to `x`. +Another option would be creating a closure, but that is sometimes undesirable for performance reasons. + +Every context argument must be wrapped in a subtype of [`Context`](@ref) and come after the active argument `x`. + +### Context types + +There are three kinds of context: [`Constant`](@ref), [`Cache`](@ref) and the hybrid [`ConstantOrCache`](@ref). +Those are also classified based on the mutation rule: + +- [`Constant`](@ref) contexts wrap data that influences the output of the function. Hence they cannot be mutated. +- [`Cache`](@ref) contexts correspond to scratch spaces that can be mutated at will. Hence their provided value is arbitrary. +- [`ConstantOrCache`](@ref) is a hybrid, whose recursive components (fields, subfields, etc.) must individually satisfy the assumptions of either `Constant` or `Cache`. + +Semantically, both of these calls compute the partial gradient of `f(x, c)` with respect to `x`, but they consider `c` differently: + +```julia +gradient(f, backend, x, Constant(c)) +gradient(f, backend, x, Cache(c)) +``` + +In the first call, `c` must be kept unchanged throughout the function evaluation. +In the second call, `c` may be mutated with values computed during the function. + +!!! warning + Not every backend supports every type of context. See the documentation on [backends](@ref "Backends") for more details. 
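+
+As a minimal sketch of how this looks in practice (the function `f` and the values below are invented for illustration, and `Cache` support depends on the chosen backend, as noted above):
+
+```julia
+using DifferentiationInterface
+import ForwardDiff
+
+# `a` influences the result but is not differentiated; `c` is scratch space
+function f(x, a, c)
+    c .= a .* x          # the cache is overwritten, so its initial value never matters
+    return sum(abs2, c)
+end
+
+x = [1.0, 2.0, 3.0]
+a = [4.0, 5.0, 6.0]
+c = similar(x)
+
+# gradient of f with respect to x only
+grad = gradient(f, AutoForwardDiff(), x, Constant(a), Cache(c))
+```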
diff --git a/DifferentiationInterface/docs/src/explanation/backends.md b/DifferentiationInterface/docs/src/explanation/backends.md index efc68b20b..0da5201a9 100644 --- a/DifferentiationInterface/docs/src/explanation/backends.md +++ b/DifferentiationInterface/docs/src/explanation/backends.md @@ -4,33 +4,33 @@ We support the following dense backend choices from [ADTypes.jl](https://github.com/SciML/ADTypes.jl): - - [`AutoChainRules`](@extref ADTypes.AutoChainRules) - - [`AutoDiffractor`](@extref ADTypes.AutoDiffractor) - - [`AutoEnzyme`](@extref ADTypes.AutoEnzyme) - - [`AutoFastDifferentiation`](@extref ADTypes.AutoFastDifferentiation) - - [`AutoFiniteDiff`](@extref ADTypes.AutoFiniteDiff) - - [`AutoFiniteDifferences`](@extref ADTypes.AutoFiniteDifferences) - - [`AutoForwardDiff`](@extref ADTypes.AutoForwardDiff) - - [`AutoGTPSA`](@extref ADTypes.AutoGTPSA) - - [`AutoMooncake`](@extref ADTypes.AutoMooncake) and [`AutoMooncakeForward`](@extref ADTypes.AutoMooncake) (the latter is experimental) - - [`AutoPolyesterForwardDiff`](@extref ADTypes.AutoPolyesterForwardDiff) - - [`AutoReverseDiff`](@extref ADTypes.AutoReverseDiff) - - [`AutoSymbolics`](@extref ADTypes.AutoSymbolics) - - [`AutoTracker`](@extref ADTypes.AutoTracker) - - [`AutoZygote`](@extref ADTypes.AutoZygote) +- [`AutoChainRules`](@extref ADTypes.AutoChainRules) +- [`AutoDiffractor`](@extref ADTypes.AutoDiffractor) +- [`AutoEnzyme`](@extref ADTypes.AutoEnzyme) +- [`AutoFastDifferentiation`](@extref ADTypes.AutoFastDifferentiation) +- [`AutoFiniteDiff`](@extref ADTypes.AutoFiniteDiff) +- [`AutoFiniteDifferences`](@extref ADTypes.AutoFiniteDifferences) +- [`AutoForwardDiff`](@extref ADTypes.AutoForwardDiff) +- [`AutoGTPSA`](@extref ADTypes.AutoGTPSA) +- [`AutoMooncake`](@extref ADTypes.AutoMooncake) and [`AutoMooncakeForward`](@extref ADTypes.AutoMooncake) (the latter is experimental) +- [`AutoPolyesterForwardDiff`](@extref ADTypes.AutoPolyesterForwardDiff) +- [`AutoReverseDiff`](@extref ADTypes.AutoReverseDiff) +- [`AutoSymbolics`](@extref ADTypes.AutoSymbolics) +- [`AutoTracker`](@extref ADTypes.AutoTracker) +- [`AutoZygote`](@extref ADTypes.AutoZygote) ## Features Given a backend object, you can use: - - [`check_available`](@ref) to know whether the required AD package is loaded - - [`check_inplace`](@ref) to know whether the backend supports in-place functions (all backends support out-of-place functions) +- [`check_available`](@ref) to know whether the required AD package is loaded +- [`check_inplace`](@ref) to know whether the backend supports in-place functions (all backends support out-of-place functions) In theory, all we need from each backend is either a `pushforward` or a `pullback`: we can deduce every other operator from these two. In practice, many AD backends have custom implementations for high-level operators like `gradient` or `jacobian`, which we reuse whenever possible. !!! details - + In the rough summary table below, - ✅ means that we reuse the custom implementation from the backend; @@ -90,7 +90,7 @@ The inner backend will be called first, and the outer backend will differentiate In general, using a forward outer backend over a reverse inner backend will yield the best performance. !!! danger - + Second-order AD is tricky, and many backend combinations will fail (even if you combine a backend with itself). Be ready to experiment and open issues if necessary. 
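+
+As a hedged sketch of one commonly recommended pairing (forward-mode ForwardDiff.jl over reverse-mode Zygote.jl; the toy function is invented):
+
+```julia
+using DifferentiationInterface
+import ForwardDiff, Zygote
+
+f(x) = sum(abs2, x) / 2   # the Hessian of this function is the identity matrix
+
+# outer backend first, inner backend second ("forward over reverse")
+backend = SecondOrder(AutoForwardDiff(), AutoZygote())
+
+H = hessian(f, backend, rand(3))
+```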
@@ -99,6 +99,7 @@ In general, using a forward outer backend over a reverse inner backend will yiel The wrapper [`DifferentiateWith`](@ref) allows you to switch between backends. It takes a function `f` and specifies that `f` should be differentiated with the substitute backend of your choice, instead of whatever true backend the surrounding code is trying to use. In other words, when someone tries to differentiate `dw = DifferentiateWith(f, substitute_backend)` with `true_backend`, then `substitute_backend` steps in and `true_backend` does not dive into the function `f` itself. + At the moment, `DifferentiateWith` only works when `true_backend` is either [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl), reverse-mode [Mooncake.jl](https://github.com/chalk-lab/Mooncake.jl), or a [ChainRules.jl](https://github.com/JuliaDiff/ChainRules.jl)-compatible backend (e.g., [Zygote.jl](https://github.com/FluxML/Zygote.jl)). ## Implementations @@ -117,7 +118,7 @@ Same-point preparation runs the forward sweep and returns the pullback closure. We only implement `pushforward`. !!! danger - + The latest releases of Diffractor [broke DifferentiationInterface](https://github.com/JuliaDiff/Diffractor.jl/issues/290). ### Enzyme @@ -126,7 +127,7 @@ Depending on the `mode` attribute inside [`AutoEnzyme`](@extref ADTypes.AutoEnzy When necessary, preparation chooses a number of chunks (for `gradient` and `jacobian` in forward mode, for `jacobian` only in reverse mode). !!! warning - + Enzyme.jl's handling of activities and multiple arguments is not fully supported here, which can cause slowdowns or errors. If differentiation fails or takes too long, consider using Enzyme.jl through its [native API](https://enzymead.github.io/Enzyme.jl/stable/) instead. @@ -135,7 +136,7 @@ When necessary, preparation chooses a number of chunks (for `gradient` and `jaco For every operator, preparation generates an [executable function](https://brianguenter.github.io/FastDifferentiation.jl/stable/makefunction/) from the symbolic expression of the differentiated function. !!! warning - + Preparation can be very slow for symbolic AD. ### FiniteDiff @@ -159,7 +160,7 @@ For all operators, preparation preallocates the input [`TPS`s](https://bmad-sim. If a GTPSA [`Descriptor`](https://bmad-sim.github.io/GTPSA.jl/stable/man/b_descriptor/) is not provided to `AutoGTPSA`, then a `Descriptor` will be generated in preparation based on the context. !!! danger - + When providing a custom GTPSA `Descriptor` to `AutoGTPSA`, it is the responsibility of the user to ensure that the number of [GTPSA "variables"](https://bmad-sim.github.io/GTPSA.jl/stable/quickstart/#Calculating-a-Truncated-Power-Series) specified in the `Descriptor` is consistent with the number of inputs of the provided function. Undefined behavior and crashes may occur if this is not the case. ### PolyesterForwardDiff @@ -175,7 +176,7 @@ This tape is computed from the input `x` provided at preparation time. It is control-flow dependent, so only one branch is recorded at each `if` statement. !!! danger - + If your function has value-specific control flow (like `if x[1] > 0` or `if c == 1`), you may get silently wrong results whenever it takes new branches that were not taken during preparation. You must make sure to run preparation with an input and contexts whose values trigger the correct control flow for future executions. 
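+
+As a hypothetical illustration of this pitfall (the function, the inputs and the `compile=true` option are chosen only for the sake of the example):
+
+```julia
+using DifferentiationInterface
+import ReverseDiff
+
+f(x) = x[1] > 0 ? sum(abs2, x) : sum(x)   # value-dependent control flow
+
+backend = AutoReverseDiff(; compile=true)
+prep = prepare_gradient(f, backend, [1.0, 2.0])   # tape records the `x[1] > 0` branch
+
+gradient(f, prep, backend, [3.0, 4.0])    # same branch as during preparation: correct
+gradient(f, prep, backend, [-3.0, 4.0])   # other branch: silently wrong result
+```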
@@ -186,7 +187,7 @@ Whenever contexts are provided, tape recording is deactivated in all cases, beca For all operators, preparation generates an [executable function](https://docs.sciml.ai/Symbolics/stable/manual/build_function/) from the symbolic expression of the differentiated function. !!! warning - + Preparation can be very slow for symbolic AD. ### Mooncake diff --git a/DifferentiationInterface/docs/src/explanation/operators.md b/DifferentiationInterface/docs/src/explanation/operators.md index 8ff5efe0e..9664a03a4 100644 --- a/DifferentiationInterface/docs/src/explanation/operators.md +++ b/DifferentiationInterface/docs/src/explanation/operators.md @@ -1,21 +1,21 @@ # Operators !!! tip - + If there are some concepts you do not understand, take a look at the book [_The Elements of Differentiable Programming_](https://arxiv.org/abs/2403.14606) (Blondel and Roulet, 2024). ## List of operators Given a function `f(x) = y`, there are several differentiation operators available. The terminology depends on: - - the type and shape of the input `x` - - the type and shape of the output `y` - - the order of differentiation +- the type and shape of the input `x` +- the type and shape of the output `y` +- the order of differentiation Below we list and describe all the operators we support. !!! warning - + The package is thoroughly tested with inputs and outputs of the following types: `Float64`, `Vector{Float64}` and `Matrix{Float64}`. We also expect it to work on most kinds of `Number` and `AbstractArray` variables. Beyond that, you are in uncharted territory. @@ -48,8 +48,8 @@ You can think of tangents as perturbations propagated through the function; they Several variants of each operator are defined: - - out-of-place operators return a new derivative object - - in-place operators mutate the provided derivative object +- out-of-place operators return a new derivative object +- in-place operators mutate the provided derivative object | out-of-place | in-place | out-of-place + primal | in-place + primal | |:--------------------------- |:---------------------------- |:------------------------------------------------ |:------------------------------------------------- | @@ -66,11 +66,11 @@ Several variants of each operator are defined: Two kinds of functions are supported: - - out-of-place functions `f(x) = y` - - in-place functions `f!(y, x) = nothing` +- out-of-place functions `f(x) = y` +- in-place functions `f!(y, x) = nothing` !!! warning - + In-place functions only work with [`pushforward`](@ref), [`pullback`](@ref), [`derivative`](@ref) and [`jacobian`](@ref). The other operators [`hvp`](@ref), [`gradient`](@ref) and [`hessian`](@ref) require scalar outputs, so it makes no sense to mutate the number `y`. @@ -82,7 +82,7 @@ This results in various operator signatures (the necessary arguments and their o | in-place function `f!` | `op(f!, y, backend, x, [t])` | `op!(f!, y, result, backend, x, [t])` | !!! warning - + The positional arguments between `f`/`f!` and `backend` are always mutated, regardless of the bang `!` in the operator name. In particular, for in-place functions `f!(y, x)`, every variant of every operator will mutate `y`. @@ -120,7 +120,7 @@ op(f, prep, backend, x, [t]) # fast because it skips preparation ``` !!! warning - + The `prep` object is the last argument before `backend` and it is always mutated, regardless of the bang `!` in the operator name. As a consequence, preparation is **not thread-safe** and sharing `prep` objects between threads may lead to unexpected behavior. 
If you need to run differentiation concurrently, prepare separate `prep` objects for each thread. @@ -141,22 +141,25 @@ op(f, prep, [other_y], backend, other_x, [other_t, other_contexts...]) provided that the following conditions all hold: - - `f` and `backend` remain the same - - `other_x` has the same type and size as `x` - - `other_y` has the same type and size as `y` - - `other_t` has the same type and size as `t` - - all the elements of `other_contexts` have the same type and size as the corresponding elements of `contexts` +- `f` and `backend` remain the same +- `other_x` has the same type and size as `x` +- `other_y` has the same type and size as `y` +- `other_t` has the same type and size as `t` +- all the elements of `other_contexts` have the same type and size as the corresponding elements of `contexts` For same-point preparation, the same rules hold with two modifications: - - `other_x` must be _equal_ to `x` - - any element of `other_contexts` with type `Constant` must be _equal_ to the corresponding element of `contexts` +- `other_x` must be _equal_ to `x` +- any element of `other_contexts` with type `Constant`, as well as any constant parts of `ConstantOrCache`, must be _equal_ to the corresponding element of `contexts` + +Therein lies the key difference between same-point and different-point preparation: in the latter, input and context values are allowed to differ. !!! danger - - Reusing preparation with different types or sizes may work with some backends and error with others, so it is not allowed by the API of DifferentiationInterface. + + Reusing preparation with different types or sizes may work with some backends and error with others, so it is not allowed by default. + To circumvent this limitation, you can pass `strict=Val(false)` to the preparation functions, but do so at your own risk. !!! warning - + These rules hold for the majority of backends, but there are some exceptions. The most important exception is [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) and its taping mechanism, which is sensitive to control flow inside the function. diff --git a/DifferentiationInterface/docs/src/faq/differentiability.md b/DifferentiationInterface/docs/src/faq/differentiability.md index 7c6da62bc..957ef6383 100644 --- a/DifferentiationInterface/docs/src/faq/differentiability.md +++ b/DifferentiationInterface/docs/src/faq/differentiability.md @@ -7,11 +7,11 @@ To make your functions compatible with several backends, you need to mind the re The list of backends available at [juliadiff.org](https://juliadiff.org/) is split into 2 main families: operator overloading and source transformation. Writing differentiable code requires a specific approach in each paradigm: - - For operator overloading, ensure type-genericity. - - For source transformation, rely on existing rules or write your own. +- For operator overloading, ensure type-genericity. +- For source transformation, rely on existing rules or write your own. !!! tip - + Depending on your intended use case, you may not need to ensure compatibility with every single backend. In particular, some applications strongly suggest a specific "mode" of AD (forward or reverse), in which case backends limited to the other mode are mostly irrelevant. @@ -32,14 +32,14 @@ MethodError: no method matching Float64(::ForwardDiff.Dual{...}) To prevent them, here are a few things to look out for: - - Avoid functions with overly specific type annotations. +- Avoid functions with overly specific type annotations. 
```julia f(x::Vector{Float64}) = x # bad f(x::AbstractVector{<:Real}) = x # good ``` - - When creating new containers or buffers, adapt to the input number type if necessary. +- When creating new containers or buffers, adapt to the input number type if necessary. ```julia tmp = zeros(length(x)) # bad @@ -93,10 +93,10 @@ Its [rule system](https://chalk-lab.github.io/Mooncake.jl/stable/understanding_m To summarize, here are the main rule systems which coexist at the moment: - - `Dual` numbers in ForwardDiff.jl - - ChainRulesCore.jl - - Enzyme.jl - - Mooncake.jl +- `Dual` numbers in ForwardDiff.jl +- ChainRulesCore.jl +- Enzyme.jl +- Mooncake.jl ### Rule translation @@ -105,9 +105,9 @@ ChainRulesCore.jl is the closest thing we have to a standard, but it does not ha As a result, Enzyme.jl and Mooncake.jl both rolled out their own designs, which are not mutually compatible. There are, however, translation utilities: - - from ChainRulesCore.jl to ForwardDiff.jl with [ForwardDiffChainRules.jl](https://github.com/ThummeTo/ForwardDiffChainRules.jl) - - from ChainRulesCore.jl to Enzyme.jl with [`Enzyme.@import_rrule`](https://enzymead.github.io/Enzyme.jl/stable/api/#Enzyme.@import_rrule-Tuple) - - from ChainRulesCore.jl to Mooncake.jl with [`Mooncake.@from_rrule`](https://chalk-lab.github.io/Mooncake.jl/stable/utilities/defining_rules/#Using-ChainRules.jl) +- from ChainRulesCore.jl to ForwardDiff.jl with [ForwardDiffChainRules.jl](https://github.com/ThummeTo/ForwardDiffChainRules.jl) +- from ChainRulesCore.jl to Enzyme.jl with [`Enzyme.@import_rrule`](https://enzymead.github.io/Enzyme.jl/stable/api/#Enzyme.@import_rrule-Tuple) +- from ChainRulesCore.jl to Mooncake.jl with [`Mooncake.@from_rrule`](https://chalk-lab.github.io/Mooncake.jl/stable/utilities/defining_rules/#Using-ChainRules.jl) ### Backend switch diff --git a/DifferentiationInterface/docs/src/tutorials/advanced.md b/DifferentiationInterface/docs/src/tutorials/advanced.md index 0751f0e22..8a5497afb 100644 --- a/DifferentiationInterface/docs/src/tutorials/advanced.md +++ b/DifferentiationInterface/docs/src/tutorials/advanced.md @@ -57,7 +57,7 @@ For additional arguments which act as mutated buffers, the [`Cache`](@ref) wrapp ## Sparsity !!! tip - + If you use DifferentiationInterface's Sparse AD functionality in your research, please cite our preprint [*Sparser, Better, Faster, Stronger: Efficient Automatic Differentiation for Sparse Jacobians and Hessians*](https://arxiv.org/abs/2501.17737). diff --git a/DifferentiationInterfaceTest/CHANGELOG.md b/DifferentiationInterfaceTest/CHANGELOG.md index bd5216844..b314043ee 100644 --- a/DifferentiationInterfaceTest/CHANGELOG.md +++ b/DifferentiationInterfaceTest/CHANGELOG.md @@ -5,7 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [Unreleased](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterfaceTest-v0.10.3...main) +## [Unreleased](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterfaceTest-v0.10.4...main) + +## [0.10.4](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterfaceTest-v0.10.3...DifferentiationInterfaceTest-v0.10.4) + +### Removed + +- Remove neural network tests ([#914](https://github.com/JuliaDiff/DifferentiationInterface.jl/pull/914)) ## [0.10.3](https://github.com/JuliaDiff/DifferentiationInterface.jl/compare/DifferentiationInterfaceTest-v0.10.2...DifferentiationInterfaceTest-v0.10.3) diff --git a/DifferentiationInterfaceTest/Project.toml b/DifferentiationInterfaceTest/Project.toml index bca102eda..a417d9eac 100644 --- a/DifferentiationInterfaceTest/Project.toml +++ b/DifferentiationInterfaceTest/Project.toml @@ -1,7 +1,7 @@ name = "DifferentiationInterfaceTest" uuid = "a82114a7-5aa3-49a8-9643-716bb13727a3" authors = ["Guillaume Dalle", "Adrian Hill"] -version = "0.10.3" +version = "0.10.4" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"