From fa84113cc8cea90031d43a413cb309467808f828 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Mon, 1 Dec 2025 14:01:14 +0100
Subject: [PATCH 01/17] Pass custom_data through assemblers to kernel
 functions.

---
 cpp/dolfinx/fem/Form.h                   |  45 +++-
 cpp/dolfinx/fem/assemble_matrix_impl.h   |  25 +-
 cpp/dolfinx/fem/assemble_scalar_impl.h   |  22 +-
 cpp/dolfinx/fem/assemble_vector_impl.h   |  72 +++---
 python/dolfinx/wrappers/fem.cpp          |  16 ++
 python/test/unit/fem/test_custom_data.py | 287 +++++++++++++++++++++++
 6 files changed, 419 insertions(+), 48 deletions(-)
 create mode 100644 python/test/unit/fem/test_custom_data.py
diff --git a/cpp/dolfinx/fem/Form.h b/cpp/dolfinx/fem/Form.h
index 6bbb426e5c..c4519c6faf 100644
--- a/cpp/dolfinx/fem/Form.h
+++ b/cpp/dolfinx/fem/Form.h
@@ -55,6 +55,8 @@ struct integral_data
   /// @param[in] entities Indices of entities to integrate over.
   /// @param[in] coeffs Indices of the coefficients that are present
   /// (active) in `kernel`.
+  /// @param[in] custom_data Optional custom user data pointer passed to
+  /// the kernel function.
   template <typename K, typename V, typename W>
     requires std::is_convertible_v<
                  std::remove_cvref_t<K>,
@@ -64,9 +66,10 @@ struct integral_data
                                            std::vector<std::int32_t>>
                  and std::is_convertible_v<std::remove_cvref_t<W>,
                                            std::vector<int>>
-  integral_data(K&& kernel, V&& entities, W&& coeffs)
+  integral_data(K&& kernel, V&& entities, W&& coeffs,
+                void* custom_data = nullptr)
       : kernel(std::forward<K>(kernel)), entities(std::forward<V>(entities)),
-        coeffs(std::forward<W>(coeffs))
+        coeffs(std::forward<W>(coeffs)), custom_data(custom_data)
   {
   }
 
@@ -82,6 +85,11 @@ struct integral_data
   /// @brief Indices of coefficients (from the form) that are in this
   /// integral.
   std::vector<int> coeffs;
+
+  /// @brief Custom user data pointer passed to the kernel function.
+  /// This can be used to pass runtime-computed data (e.g., per-cell
+  /// quadrature rules, material properties) to the kernel.
+  void* custom_data = nullptr;
 };
 
 /// @brief A representation of finite element variational forms.
@@ -391,6 +399,39 @@ class Form
     return it->second.kernel;
   }
 
+  /// @brief Get the custom data pointer for an integral.
+  ///
+  /// The custom data pointer is passed to the kernel function during
+  /// assembly. This can be used to pass runtime-computed data to
+  /// kernels (e.g., per-cell quadrature rules, material properties).
+  ///
+  /// @param[in] type Integral type.
+  /// @param[in] id Integral subdomain ID.
+  /// @param[in] kernel_idx Index of the kernel (we may have multiple
+  /// kernels for a given ID in mixed-topology meshes).
+  /// @return Custom data pointer for the integral, or nullptr if not set.
+  void* custom_data(IntegralType type, int id, int kernel_idx) const
+  {
+    auto it = _integrals.find({type, id, kernel_idx});
+    if (it == _integrals.end())
+      throw std::runtime_error("Requested integral not found.");
+    return it->second.custom_data;
+  }
+
+  /// @brief Set the custom data pointer for an integral.
+  ///
+  /// @param[in] type Integral type.
+  /// @param[in] id Integral subdomain ID.
+  /// @param[in] kernel_idx Index of the kernel.
+  /// @param[in] data Custom data pointer to set.
+  void set_custom_data(IntegralType type, int id, int kernel_idx, void* data)
+  {
+    auto it = _integrals.find({type, id, kernel_idx});
+    if (it == _integrals.end())
+      throw std::runtime_error("Requested integral not found.");
+    it->second.custom_data = data;
+  }
+
   /// @brief Get types of integrals in the form.
   /// @return Integrals types.
   std::set<IntegralType> integral_types() const
diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index 3b6148f9e9..99f3553a5b 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -60,6 +60,7 @@ using mdspan2_t = md::mdspan<const std::int32_t, md::dextents<std::size_t, 2>>;
 /// function mesh.
 /// @param cell_info1 Cell permutation information for the trial
 /// function mesh.
+/// @param custom_data Custom user data pointer passed to the kernel.
 template <dolfinx::scalar T>
 void assemble_cells_matrix(
     la::MatSet<T> auto mat_set, mdspan2_t x_dofmap,
@@ -74,7 +75,7 @@ void assemble_cells_matrix(
     std::span<const std::int8_t> bc1, FEkernel<T> auto kernel,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     std::span<const T> constants, std::span<const std::uint32_t> cell_info0,
-    std::span<const std::uint32_t> cell_info1)
+    std::span<const std::uint32_t> cell_info1, void* custom_data = nullptr)
 {
   if (cells.empty())
     return;
@@ -109,7 +110,7 @@ void assemble_cells_matrix(
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), nullptr,
-           nullptr, nullptr);
+           nullptr, custom_data);
 
     // Compute A = P_0 \tilde{A} P_1^T (dof transformation)
     P0(Ae, cell_info0, cell0, ndim1);  // B = P0 \tilde{A}
@@ -198,6 +199,7 @@ void assemble_cells_matrix(
 /// function mesh.
 /// @param[in] perms Entity permutation integer. Empty if entity
 /// permutations are not required.
+/// @param custom_data Custom user data pointer passed to the kernel.
 template <dolfinx::scalar T>
 void assemble_entities(
     la::MatSet<T> auto mat_set, mdspan2_t x_dofmap,
@@ -221,7 +223,8 @@ void assemble_entities(
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     std::span<const T> constants, std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1,
-    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms)
+    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
+    void* custom_data = nullptr)
 {
   if (entities.empty())
     return;
@@ -259,7 +262,7 @@ void assemble_entities(
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(f, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, nullptr);
+           &local_entity, &perm, custom_data);
     P0(Ae, cell_info0, cell0, ndim1);
     P1T(Ae, cell_info1, cell1, ndim0);
 
@@ -363,7 +366,8 @@ void assemble_interior_facets(
         coeffs,
     std::span<const T> constants, std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1,
-    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms)
+    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
+    void* custom_data = nullptr)
 {
   if (facets.empty())
     return;
@@ -440,7 +444,7 @@ void assemble_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     kernel(Ae.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), nullptr);
+           local_facet.data(), perm.data(), custom_data);
 
     // Local element layout is a 2x2 block matrix with structure
     //
@@ -605,12 +609,13 @@ void assemble_matrix(
       std::span cells0 = a.domain_arg(IntegralType::cell, 0, i, cell_type_idx);
       std::span cells1 = a.domain_arg(IntegralType::cell, 1, i, cell_type_idx);
       auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
+      void* custom_data = a.custom_data(IntegralType::cell, i, cell_type_idx);
       assert(cells.size() * cstride == coeffs.size());
       impl::assemble_cells_matrix(
           mat_set, x_dofmap, x, cells, {dofs0, bs0, cells0}, P0,
           {dofs1, bs1, cells1}, P1T, bc0, bc1, fn,
           md::mdspan(coeffs.data(), cells.size(), cstride), constants,
-          cell_info0, cell_info1);
+          cell_info0, cell_info1, custom_data);
     }
 
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> facet_perms;
@@ -646,6 +651,7 @@ void assemble_matrix(
       assert(fn);
       auto& [coeffs, cstride]
           = coefficients.at({IntegralType::interior_facet, i});
+      void* custom_data = a.custom_data(IntegralType::interior_facet, i, 0);
 
       std::span facets = a.domain(IntegralType::interior_facet, i, 0);
       std::span facets0 = a.domain_arg(IntegralType::interior_facet, 0, i, 0);
@@ -661,7 +667,7 @@ void assemble_matrix(
            mdspanx22_t(facets1.data(), facets1.size() / 4, 2, 2)},
           P1T, bc0, bc1, fn,
           mdspanx2x_t(coeffs.data(), facets.size() / 4, 2, cstride), constants,
-          cell_info0, cell_info1, facet_perms);
+          cell_info0, cell_info1, facet_perms, custom_data);
     }
 
     for (auto itg_type : {fem::IntegralType::exterior_facet,
@@ -688,6 +694,7 @@ void assemble_matrix(
         auto fn = a.kernel(itg_type, i, 0);
         assert(fn);
         auto& [coeffs, cstride] = coefficients.at({itg_type, i});
+        void* custom_data = a.custom_data(itg_type, i, 0);
 
         std::span e = a.domain(itg_type, i, 0);
         mdspanx2_t entities(e.data(), e.size() / 2, 2);
@@ -700,7 +707,7 @@ void assemble_matrix(
             mat_set, x_dofmap, x, entities, {dofs0, bs0, entities0}, P0,
             {dofs1, bs1, entities1}, P1T, bc0, bc1, fn,
             md::mdspan(coeffs.data(), entities.extent(0), cstride), constants,
-            cell_info0, cell_info1, perms);
+            cell_info0, cell_info1, perms, custom_data);
       }
     }
   }
diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 37166b46ea..88f5187b3f 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -30,7 +30,8 @@ T assemble_cells(mdspan2_t x_dofmap,
                  std::span<const std::int32_t> cells, FEkernel<T> auto fn,
                  std::span<const T> constants,
                  md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
-                 std::span<scalar_value_t<T>> cdofs_b)
+                 std::span<scalar_value_t<T>> cdofs_b,
+                 void* custom_data = nullptr)
 {
   T value(0);
   if (cells.empty())
@@ -49,7 +50,7 @@ T assemble_cells(mdspan2_t x_dofmap,
       std::copy_n(&x(x_dofs[i], 0), 3, std::next(cdofs_b.begin(), 3 * i));
 
     fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr,
-       nullptr, nullptr);
+       nullptr, custom_data);
   }
 
   return value;
@@ -77,7 +78,7 @@ T assemble_entities(
     FEkernel<T> auto fn, std::span<const T> constants,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    std::span<scalar_value_t<T>> cdofs_b)
+    std::span<scalar_value_t<T>> cdofs_b, void* custom_data = nullptr)
 {
   T value(0);
   if (entities.empty())
@@ -99,7 +100,7 @@ T assemble_entities(
     // Permutations
     std::uint8_t perm = perms.empty() ? 0 : perms(cell, local_entity);
     fn(&value, &coeffs(f, 0), constants.data(), cdofs_b.data(), &local_entity,
-       &perm, nullptr);
+       &perm, custom_data);
   }
 
   return value;
@@ -120,7 +121,7 @@ T assemble_interior_facets(
                                     md::dynamic_extent>>
         coeffs,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    std::span<scalar_value_t<T>> cdofs_b)
+    std::span<scalar_value_t<T>> cdofs_b, void* custom_data = nullptr)
 {
   T value(0);
   if (facets.empty())
@@ -150,7 +151,7 @@ T assemble_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     fn(&value, &coeffs(f, 0, 0), constants.data(), cdofs_b.data(),
-       local_facet.data(), perm.data(), nullptr);
+       local_facet.data(), perm.data(), custom_data);
   }
 
   return value;
@@ -178,11 +179,12 @@ T assemble_scalar(
     auto fn = M.kernel(IntegralType::cell, i, 0);
     assert(fn);
     auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
+    void* custom_data = M.custom_data(IntegralType::cell, i, 0);
     std::span<const std::int32_t> cells = M.domain(IntegralType::cell, i, 0);
     assert(cells.size() * cstride == coeffs.size());
     value += impl::assemble_cells(
         x_dofmap, x, cells, fn, constants,
-        md::mdspan(coeffs.data(), cells.size(), cstride), cdofs_b);
+        md::mdspan(coeffs.data(), cells.size(), cstride), cdofs_b, custom_data);
   }
 
   mesh::CellType cell_type = mesh->topology()->cell_type();
@@ -204,6 +206,7 @@ T assemble_scalar(
     assert(fn);
     auto& [coeffs, cstride]
         = coefficients.at({IntegralType::interior_facet, i});
+    void* custom_data = M.custom_data(IntegralType::interior_facet, i, 0);
     std::span facets = M.domain(IntegralType::interior_facet, i, 0);
 
     constexpr std::size_t num_adjacent_cells = 2;
@@ -220,7 +223,7 @@ T assemble_scalar(
         md::mdspan<const T, md::extents<std::size_t, md::dynamic_extent, 2,
                                         md::dynamic_extent>>(
             coeffs.data(), facets.size() / shape1, 2, cstride),
-        facet_perms, cdofs_b);
+        facet_perms, cdofs_b, custom_data);
   }
 
   for (auto itg_type : {fem::IntegralType::exterior_facet,
@@ -236,6 +239,7 @@ T assemble_scalar(
       auto fn = M.kernel(itg_type, i, 0);
       assert(fn);
       auto& [coeffs, cstride] = coefficients.at({itg_type, i});
+      void* custom_data = M.custom_data(itg_type, i, 0);
 
       std::span entities = M.domain(itg_type, i, 0);
 
@@ -248,7 +252,7 @@ T assemble_scalar(
               entities.data(), entities.size() / 2, 2),
           fn, constants,
           md::mdspan(coeffs.data(), entities.size() / 2, cstride), perms,
-          cdofs_b);
+          cdofs_b, custom_data);
     }
   }
 
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index f57e646dd9..de03eade63 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -92,7 +92,8 @@ void _lift_bc_cells(
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1, std::span<const T> bc_values1,
-    std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha)
+    std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha,
+    void* custom_data = nullptr)
 {
   if (cells.empty())
     return;
@@ -164,7 +165,7 @@ void _lift_bc_cells(
 
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           nullptr, nullptr, nullptr);
+           nullptr, nullptr, custom_data);
     P0(Ae, cell_info0, c0, num_cols);
     P1T(Ae, cell_info1, c1, num_rows);
 
@@ -286,7 +287,8 @@ void _lift_bc_entities(
     std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1, std::span<const T> bc_values1,
     std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha,
-    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms)
+    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
+    void* custom_data = nullptr)
 {
   if (entities.empty())
     return;
@@ -345,7 +347,7 @@ void _lift_bc_entities(
     std::uint8_t perm = perms.empty() ? 0 : perms(cell, local_entity);
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, nullptr);
+           &local_entity, &perm, custom_data);
     P0(Ae, cell_info0, cell0, num_cols);
     P1T(Ae, cell_info1, cell1, num_rows);
 
@@ -443,7 +445,8 @@ void _lift_bc_interior_facets(
     std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1, std::span<const T> bc_values1,
     std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha,
-    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms)
+    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
+    void* custom_data = nullptr)
 {
   if (facets.empty())
     return;
@@ -559,7 +562,7 @@ void _lift_bc_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     kernel(Ae.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), nullptr);
+           local_facet.data(), perm.data(), custom_data);
 
     if (cells0[0] >= 0)
       P0(Ae, cell_info0, cells0[0], num_cols);
@@ -671,7 +674,7 @@ void assemble_cells(
     std::tuple<mdspan2_t, int, std::span<const std::int32_t>> dofmap,
     FEkernel<T> auto kernel, std::span<const T> constants,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
-    std::span<const std::uint32_t> cell_info0)
+    std::span<const std::uint32_t> cell_info0, void* custom_data = nullptr)
 {
   if (cells.empty())
     return;
@@ -698,7 +701,7 @@ void assemble_cells(
     // Tabulate vector for cell
     std::ranges::fill(be, 0);
     kernel(be.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           nullptr, nullptr, nullptr);
+           nullptr, nullptr, custom_data);
     P0(be, cell_info0, c0, 1);
 
     // Scatter cell vector to 'global' vector array
@@ -769,7 +772,8 @@ void assemble_entities(
     FEkernel<T> auto kernel, std::span<const T> constants,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     std::span<const std::uint32_t> cell_info0,
-    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms)
+    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
+    void* custom_data = nullptr)
 {
   if (entities.empty())
     return;
@@ -801,7 +805,7 @@ void assemble_entities(
     // Tabulate element vector
     std::ranges::fill(be, 0);
     kernel(be.data(), &coeffs(f, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, nullptr);
+           &local_entity, &perm, custom_data);
     P0(be, cell_info0, cell0, 1);
 
     // Add element vector to global vector
@@ -866,7 +870,8 @@ void assemble_interior_facets(
                                     md::dynamic_extent>>
         coeffs,
     std::span<const std::uint32_t> cell_info0,
-    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms)
+    md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
+    void* custom_data = nullptr)
 {
   using X = scalar_value_t<T>;
 
@@ -918,7 +923,7 @@ void assemble_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     kernel(be.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), nullptr);
+           local_facet.data(), perm.data(), custom_data);
 
     if (cells0[0] >= 0)
       P0(be, cell_info0, cells0[0], 1);
@@ -1026,6 +1031,7 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
     auto kernel = a.kernel(IntegralType::cell, i, 0);
     assert(kernel);
     auto& [_coeffs, cstride] = coefficients.at({IntegralType::cell, i});
+    void* custom_data = a.custom_data(IntegralType::cell, i, 0);
     std::span cells = a.domain(IntegralType::cell, i, 0);
     std::span cells0 = a.domain_arg(IntegralType::cell, 0, i, 0);
     std::span cells1 = a.domain_arg(IntegralType::cell, 1, i, 0);
@@ -1036,20 +1042,21 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
       _lift_bc_cells<1, 1>(b, x_dofmap, x, kernel, cells,
                            {dofmap0, bs0, cells0}, P0, {dofmap1, bs1, cells1},
                            P1T, constants, coeffs, cell_info0, cell_info1,
-                           bc_values1, bc_markers1, x0, alpha);
+                           bc_values1, bc_markers1, x0, alpha, custom_data);
     }
     else if (bs0 == 3 and bs1 == 3)
     {
       _lift_bc_cells<3, 3>(b, x_dofmap, x, kernel, cells,
                            {dofmap0, bs0, cells0}, P0, {dofmap1, bs1, cells1},
                            P1T, constants, coeffs, cell_info0, cell_info1,
-                           bc_values1, bc_markers1, x0, alpha);
+                           bc_values1, bc_markers1, x0, alpha, custom_data);
     }
     else
     {
       _lift_bc_cells(b, x_dofmap, x, kernel, cells, {dofmap0, bs0, cells0}, P0,
                      {dofmap1, bs1, cells1}, P1T, constants, coeffs, cell_info0,
-                     cell_info1, bc_values1, bc_markers1, x0, alpha);
+                     cell_info1, bc_values1, bc_markers1, x0, alpha,
+                     custom_data);
     }
   }
 
@@ -1072,6 +1079,7 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
     assert(kernel);
     auto& [coeffs, cstride]
         = coefficients.at({IntegralType::interior_facet, i});
+    void* custom_data = a.custom_data(IntegralType::interior_facet, i, 0);
 
     using mdspanx22_t
         = md::mdspan<const std::int32_t,
@@ -1089,7 +1097,8 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
         b, x_dofmap, x, kernel, facets, {dofmap0, bs0, facets0}, P0,
         {dofmap1, bs1, facets1}, P1T, constants,
         mdspanx2x_t(coeffs.data(), facets.extent(0), 2, cstride), cell_info0,
-        cell_info1, bc_values1, bc_markers1, x0, alpha, facet_perms);
+        cell_info1, bc_values1, bc_markers1, x0, alpha, facet_perms,
+        custom_data);
   }
 
   for (auto itg_type : {fem::IntegralType::exterior_facet,
@@ -1105,6 +1114,7 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
       auto kernel = a.kernel(itg_type, i, 0);
       assert(kernel);
       auto& [coeffs, cstride] = coefficients.at({itg_type, i});
+      void* custom_data = a.custom_data(itg_type, i, 0);
 
       using mdspanx2_t
           = md::mdspan<const std::int32_t,
@@ -1120,7 +1130,7 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
           b, x_dofmap, x, kernel, entities, {dofmap0, bs0, entities0}, P0,
           {dofmap1, bs1, entities1}, P1T, constants,
           md::mdspan(coeffs.data(), entities.extent(0), cstride), cell_info0,
-          cell_info1, bc_values1, bc_markers1, x0, alpha, perms);
+          cell_info1, bc_values1, bc_markers1, x0, alpha, perms, custom_data);
     }
   }
 }
@@ -1276,24 +1286,28 @@ void assemble_vector(
       std::span cells = L.domain(IntegralType::cell, i, cell_type_idx);
       std::span cells0 = L.domain_arg(IntegralType::cell, 0, i, cell_type_idx);
       auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
+      void* custom_data = L.custom_data(IntegralType::cell, i, cell_type_idx);
       assert(cells.size() * cstride == coeffs.size());
       if (bs == 1)
       {
         impl::assemble_cells<1>(
             P0, b, x_dofmap, x, cells, {dofs, bs, cells0}, fn, constants,
-            md::mdspan(coeffs.data(), cells.size(), cstride), cell_info0);
+            md::mdspan(coeffs.data(), cells.size(), cstride), cell_info0,
+            custom_data);
       }
       else if (bs == 3)
       {
         impl::assemble_cells<3>(
             P0, b, x_dofmap, x, cells, {dofs, bs, cells0}, fn, constants,
-            md::mdspan(coeffs.data(), cells.size(), cstride), cell_info0);
+            md::mdspan(coeffs.data(), cells.size(), cstride), cell_info0,
+            custom_data);
       }
       else
       {
-        impl::assemble_cells(
-            P0, b, x_dofmap, x, cells, {dofs, bs, cells0}, fn, constants,
-            md::mdspan(coeffs.data(), cells.size(), cstride), cell_info0);
+        impl::assemble_cells(P0, b, x_dofmap, x, cells, {dofs, bs, cells0}, fn,
+                             constants,
+                             md::mdspan(coeffs.data(), cells.size(), cstride),
+                             cell_info0, custom_data);
       }
     }
 
@@ -1327,6 +1341,7 @@ void assemble_vector(
       assert(fn);
       auto& [coeffs, cstride]
           = coefficients.at({IntegralType::interior_facet, i});
+      void* custom_data = L.custom_data(IntegralType::interior_facet, i, 0);
       std::span facets = L.domain(IntegralType::interior_facet, i, 0);
       std::span facets1 = L.domain_arg(IntegralType::interior_facet, 0, i, 0);
       assert((facets.size() / 4) * 2 * cstride == coeffs.size());
@@ -1339,7 +1354,7 @@ void assemble_vector(
              mdspanx22_t(facets1.data(), facets1.size() / 4, 2, 2)},
             fn, constants,
             mdspanx2x_t(coeffs.data(), facets.size() / 4, 2, cstride),
-            cell_info0, facet_perms);
+            cell_info0, facet_perms, custom_data);
       }
       else if (bs == 3)
       {
@@ -1350,7 +1365,7 @@ void assemble_vector(
              mdspanx22_t(facets1.data(), facets1.size() / 4, 2, 2)},
             fn, constants,
             mdspanx2x_t(coeffs.data(), facets.size() / 4, 2, cstride),
-            cell_info0, facet_perms);
+            cell_info0, facet_perms, custom_data);
       }
       else
       {
@@ -1361,7 +1376,7 @@ void assemble_vector(
              mdspanx22_t(facets1.data(), facets1.size() / 4, 2, 2)},
             fn, constants,
             mdspanx2x_t(coeffs.data(), facets.size() / 4, 2, cstride),
-            cell_info0, facet_perms);
+            cell_info0, facet_perms, custom_data);
       }
     }
 
@@ -1378,6 +1393,7 @@ void assemble_vector(
         auto fn = L.kernel(itg_type, i, 0);
         assert(fn);
         auto& [coeffs, cstride] = coefficients.at({itg_type, i});
+        void* custom_data = L.custom_data(itg_type, i, 0);
         std::span e = L.domain(itg_type, i, 0);
         mdspanx2_t entities(e.data(), e.size() / 2, 2);
         std::span e1 = L.domain_arg(itg_type, 0, i, 0);
@@ -1388,7 +1404,7 @@ void assemble_vector(
           impl::assemble_entities<1>(
               P0, b, x_dofmap, x, entities, {dofs, bs, entities1}, fn,
               constants, md::mdspan(coeffs.data(), entities.extent(0), cstride),
-              cell_info0, perms);
+              cell_info0, perms, custom_data);
         }
         else if (bs == 3)
         {
@@ -1396,7 +1412,7 @@ void assemble_vector(
               P0, b, x_dofmap, x, entities, {dofs, bs, entities1}, fn,
               constants,
               md::mdspan(coeffs.data(), entities.size() / 2, cstride),
-              cell_info0, perms);
+              cell_info0, perms, custom_data);
         }
         else
         {
@@ -1404,7 +1420,7 @@ void assemble_vector(
               P0, b, x_dofmap, x, entities, {dofs, bs, entities1}, fn,
               constants,
               md::mdspan(coeffs.data(), entities.size() / 2, cstride),
-              cell_info0, perms);
+              cell_info0, perms, custom_data);
         }
       }
     }
diff --git a/python/dolfinx/wrappers/fem.cpp b/python/dolfinx/wrappers/fem.cpp
index 9e4b365722..52ec70091d 100644
--- a/python/dolfinx/wrappers/fem.cpp
+++ b/python/dolfinx/wrappers/fem.cpp
@@ -761,6 +761,22 @@ void declare_form(nb::module_& m, std::string type)
       .def_prop_ro("integral_types", &dolfinx::fem::Form<T, U>::integral_types)
       .def_prop_ro("needs_facet_permutations",
                    &dolfinx::fem::Form<T, U>::needs_facet_permutations)
+      .def(
+          "set_custom_data",
+          [](dolfinx::fem::Form<T, U>& self, dolfinx::fem::IntegralType type,
+             int id, int kernel_idx, std::uintptr_t data)
+          { self.set_custom_data(type, id, kernel_idx, (void*)data); },
+          nb::arg("type"), nb::arg("id"), nb::arg("kernel_idx"),
+          nb::arg("data"),
+          "Set custom data pointer for an integral. The data pointer is "
+          "passed to the kernel.")
+      .def(
+          "custom_data",
+          [](const dolfinx::fem::Form<T, U>& self,
+             dolfinx::fem::IntegralType type, int id, int kernel_idx)
+          { return (std::uintptr_t)self.custom_data(type, id, kernel_idx); },
+          nb::arg("type"), nb::arg("id"), nb::arg("kernel_idx"),
+          "Get custom data pointer for an integral.")
       .def(
           "domains",
           [](const dolfinx::fem::Form<T, U>& self,
diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
new file mode 100644
index 0000000000..fee5ad88cf
--- /dev/null
+++ b/python/test/unit/fem/test_custom_data.py
@@ -0,0 +1,287 @@
+"""Unit tests for custom_data functionality in assembly."""
+
+# Copyright (C) 2025 Susanne Claus
+#
+# This file is part of DOLFINx (https://www.fenicsproject.org)
+#
+# SPDX-License-Identifier:    LGPL-3.0-or-later
+
+from mpi4py import MPI
+
+import numpy as np
+import pytest
+
+import dolfinx
+import ffcx.codegeneration.utils
+from dolfinx import la
+from dolfinx.fem import Form, IntegralType, form_cpp_class, functionspace
+from dolfinx.mesh import create_unit_square
+
+numba = pytest.importorskip("numba")
+ufcx_signature = ffcx.codegeneration.utils.numba_ufcx_kernel_signature
+
+
+# Helper intrinsic to cast void* to a typed pointer for custom_data
+@numba.extending.intrinsic
+def voidptr_to_float64_ptr(typingctx, src):
+    """Cast a void pointer (CPointer(void)) to a float64 pointer.
+
+    This function is used to access custom_data passed through the UFCx
+    tabulate_tensor interface. Since custom_data is passed as void*, this
+    intrinsic allows casting it to a typed float64 pointer for element access.
+
+    Args:
+        typingctx: The typing context.
+        src: A void pointer (CPointer(void)) to cast.
+
+    Returns:
+        sig: A Numba signature returning CPointer(float64).
+        codegen: A code generation function that performs the bitcast.
+
+    Example:
+        Inside a Numba cfunc kernel::
+
+            typed_ptr = voidptr_to_float64_ptr(custom_data)
+            scale = typed_ptr[0]  # Access first float64 value
+    """
+    # Accept CPointer(void) which shows as 'none*' in numba type system
+    if isinstance(src, numba.types.CPointer) and src.dtype == numba.types.void:
+        sig = numba.types.CPointer(numba.types.float64)(src)
+
+        def codegen(context, builder, signature, args):
+            [src] = args
+            # Cast void* to float64*
+            dst_type = context.get_value_type(numba.types.CPointer(numba.types.float64))
+            return builder.bitcast(src, dst_type)
+
+        return sig, codegen
+
+
+def tabulate_rank1_with_custom_data(dtype, xdtype):
+    """Kernel that reads a scaling factor from custom_data.
+
+    Note: custom_data must be set to a valid pointer before assembly.
+    """
+
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate(b_, w_, c_, coords_, local_index, orientation, custom_data):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Cast void* to float64* and read the scale value
+        typed_ptr = voidptr_to_float64_ptr(custom_data)
+        scale = typed_ptr[0]
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        # 2x Element area Ae
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        b[:] = scale * Ae / 6.0
+
+    return tabulate
+
+
+def tabulate_rank2_with_custom_data(dtype, xdtype):
+    """Kernel that reads a scaling factor from custom_data for matrix assembly.
+
+    Note: custom_data must be set to a valid pointer before assembly.
+    """
+
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate(A_, w_, c_, coords_, entity_local_index, cell_orientation, custom_data):
+        A = numba.carray(A_, (3, 3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Cast void* to float64* and read the scale value
+        typed_ptr = voidptr_to_float64_ptr(custom_data)
+        scale = typed_ptr[0]
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        # 2x Element area Ae
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        B = np.array([y1 - y2, y2 - y0, y0 - y1, x2 - x1, x0 - x2, x1 - x0], dtype=dtype).reshape(
+            2, 3
+        )
+        A[:, :] = scale * np.dot(B.T, B) / (2 * Ae)
+
+    return tabulate
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_vector_assembly(dtype):
+    """Test that custom_data is correctly passed to kernels during vector assembly."""
+    xdtype = np.real(dtype(0)).dtype
+    k1 = tabulate_rank1_with_custom_data(dtype, xdtype)
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 13, 13, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    integrals = {IntegralType.cell: [(0, k1.address, cells, active_coeffs)]}
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
+
+    # Create custom_data with scale=1.0 first
+    scale_value = np.array([1.0], dtype=dtype)
+    scale_ptr = scale_value.ctypes.data
+    L._cpp_object.set_custom_data(IntegralType.cell, 0, 0, scale_ptr)
+
+    # Assemble with scale=1.0
+    b1 = dolfinx.fem.assemble_vector(L)
+    b1.scatter_reverse(la.InsertMode.add)
+    norm1 = la.norm(b1)
+
+    # Verify we can read back the custom_data pointer
+    assert L._cpp_object.custom_data(IntegralType.cell, 0, 0) == scale_ptr
+
+    # Update custom_data to scale=2.0
+    scale_value[0] = 2.0
+    b2 = dolfinx.fem.assemble_vector(L)
+    b2.scatter_reverse(la.InsertMode.add)
+    norm2 = la.norm(b2)
+
+    # The norm with scale=2 should be 2x the norm with scale=1
+    assert np.isclose(norm2, 2.0 * norm1)
+
+    # Test with scale=3.0
+    scale_value[0] = 3.0
+    b3 = dolfinx.fem.assemble_vector(L)
+    b3.scatter_reverse(la.InsertMode.add)
+    norm3 = la.norm(b3)
+
+    assert np.isclose(norm3, 3.0 * norm1)
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_matrix_assembly(dtype):
+    """Test that custom_data is correctly passed to kernels during matrix assembly."""
+    xdtype = np.real(dtype(0)).dtype
+    k2 = tabulate_rank2_with_custom_data(dtype, xdtype)
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 13, 13, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    cells = np.arange(mesh.topology.index_map(mesh.topology.dim).size_local, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    integrals = {IntegralType.cell: [(0, k2.address, cells, active_coeffs)]}
+    formtype = form_cpp_class(dtype)
+    a = Form(
+        formtype(
+            [V._cpp_object, V._cpp_object],
+            integrals,
+            [],
+            [],
+            False,
+            [],
+            mesh=mesh._cpp_object,
+        )
+    )
+
+    # Set custom_data with scale=1.0 first
+    scale_value = np.array([1.0], dtype=dtype)
+    a._cpp_object.set_custom_data(IntegralType.cell, 0, 0, scale_value.ctypes.data)
+
+    # Assemble with scale=1.0
+    A1 = dolfinx.fem.assemble_matrix(a)
+    A1.scatter_reverse()
+    norm1 = np.sqrt(A1.squared_norm())
+
+    # Update custom_data to scale=2.0
+    scale_value[0] = 2.0
+    A2 = dolfinx.fem.assemble_matrix(a)
+    A2.scatter_reverse()
+    norm2 = np.sqrt(A2.squared_norm())
+
+    # The norm with scale=2 should be 2x the norm with scale=1
+    assert np.isclose(norm2, 2.0 * norm1)
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_default_nullptr(dtype):
+    """Test that custom_data defaults to nullptr (0)."""
+    xdtype = np.real(dtype(0)).dtype
+
+    # Define a simple kernel that doesn't use custom_data
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate_simple(b_, w_, c_, coords_, local_index, orientation, custom_data):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        b[:] = Ae / 6.0
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 5, 5, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    integrals = {IntegralType.cell: [(0, tabulate_simple.address, cells, active_coeffs)]}
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
+
+    # custom_data should be 0 (nullptr) by default
+    assert L._cpp_object.custom_data(IntegralType.cell, 0, 0) == 0
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_struct(dtype):
+    """Test passing a struct with multiple values through custom_data."""
+    xdtype = np.real(dtype(0)).dtype
+
+    # Define a kernel that reads two values from custom_data
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate_with_struct(b_, w_, c_, coords_, local_index, orientation, custom_data):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Cast void* to float64* and read two values: [scale, offset]
+        typed_ptr = voidptr_to_float64_ptr(custom_data)
+        scale = typed_ptr[0]
+        offset = typed_ptr[1]
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        b[:] = scale * Ae / 6.0 + offset
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 5, 5, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    integrals = {IntegralType.cell: [(0, tabulate_with_struct.address, cells, active_coeffs)]}
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
+
+    # Create struct data: [scale=2.0, offset=0.5]
+    struct_data = np.array([2.0, 0.5], dtype=dtype)
+    L._cpp_object.set_custom_data(IntegralType.cell, 0, 0, struct_data.ctypes.data)
+
+    b = dolfinx.fem.assemble_vector(L)
+    b.scatter_reverse(la.InsertMode.add)
+
+    # Verify the assembly used our custom values
+    # The offset should contribute to each DOF
+    assert la.norm(b) > 0

From 81016854556dc90ffa73861b6ac89d47bee9fe0a Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Mon, 1 Dec 2025 22:31:43 +0100
Subject: [PATCH 02/17] Use std::optional<void*> for custom_data instead of raw
 pointer

Replace void* with nullptr pattern with std::optional<void*> for
custom_data in the Form class and assembly functions. This is the
canonical modern C++ approach for representing optional values.

Changes:
- Form.h: integral_data::custom_data now uses std::optional<void*>
- Form.h: custom_data() returns std::optional<void*>
- Form.h: set_custom_data() accepts std::optional<void*>
- assemble_*_impl.h: Function parameters use std::optional<void*>
- assemble_*_impl.h: Kernel calls use .value_or(nullptr)
- Python bindings: Handle std::optional with None support
- Test: Update to expect None instead of 0 for unset custom_data

Benefits:
- Clearer intent: "optional value" vs "magic nullptr"
- Type safety: Distinguishes "no value" from "null pointer value"
- Pythonic: Returns None instead of 0 when not set
- Zero-cost: .value_or(nullptr) compiles to same code
---
 cpp/dolfinx/fem/Form.h                   | 14 +++++----
 cpp/dolfinx/fem/assemble_matrix_impl.h   | 22 +++++++------
 cpp/dolfinx/fem/assemble_scalar_impl.h   | 22 +++++++------
 cpp/dolfinx/fem/assemble_vector_impl.h   | 40 +++++++++++++-----------
 python/dolfinx/wrappers/fem.cpp          | 23 +++++++++-----
 python/test/unit/fem/test_custom_data.py |  4 +--
 6 files changed, 74 insertions(+), 51 deletions(-)

diff --git a/cpp/dolfinx/fem/Form.h b/cpp/dolfinx/fem/Form.h
index c4519c6faf..6c71141b8a 100644
--- a/cpp/dolfinx/fem/Form.h
+++ b/cpp/dolfinx/fem/Form.h
@@ -67,7 +67,7 @@ struct integral_data
                  and std::is_convertible_v<std::remove_cvref_t<W>,
                                            std::vector<int>>
   integral_data(K&& kernel, V&& entities, W&& coeffs,
-                void* custom_data = nullptr)
+                std::optional<void*> custom_data = std::nullopt)
       : kernel(std::forward<K>(kernel)), entities(std::forward<V>(entities)),
         coeffs(std::forward<W>(coeffs)), custom_data(custom_data)
   {
@@ -89,7 +89,7 @@ struct integral_data
   /// @brief Custom user data pointer passed to the kernel function.
   /// This can be used to pass runtime-computed data (e.g., per-cell
   /// quadrature rules, material properties) to the kernel.
-  void* custom_data = nullptr;
+  std::optional<void*> custom_data = std::nullopt;
 };
 
 /// @brief A representation of finite element variational forms.
@@ -409,8 +409,9 @@ class Form
   /// @param[in] id Integral subdomain ID.
   /// @param[in] kernel_idx Index of the kernel (we may have multiple
   /// kernels for a given ID in mixed-topology meshes).
-  /// @return Custom data pointer for the integral, or nullptr if not set.
-  void* custom_data(IntegralType type, int id, int kernel_idx) const
+  /// @return Custom data pointer for the integral, or std::nullopt if not set.
+  std::optional<void*> custom_data(IntegralType type, int id,
+                                   int kernel_idx) const
   {
     auto it = _integrals.find({type, id, kernel_idx});
     if (it == _integrals.end())
@@ -423,8 +424,9 @@ class Form
   /// @param[in] type Integral type.
   /// @param[in] id Integral subdomain ID.
   /// @param[in] kernel_idx Index of the kernel.
-  /// @param[in] data Custom data pointer to set.
-  void set_custom_data(IntegralType type, int id, int kernel_idx, void* data)
+  /// @param[in] data Custom data pointer to set, or std::nullopt to clear.
+  void set_custom_data(IntegralType type, int id, int kernel_idx,
+                       std::optional<void*> data)
   {
     auto it = _integrals.find({type, id, kernel_idx});
     if (it == _integrals.end())
diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index 99f3553a5b..5b68e157a8 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -18,6 +18,7 @@
 #include <dolfinx/mesh/Topology.h>
 #include <functional>
 #include <iterator>
+#include <optional>
 #include <span>
 #include <tuple>
 #include <vector>
@@ -75,7 +76,8 @@ void assemble_cells_matrix(
     std::span<const std::int8_t> bc1, FEkernel<T> auto kernel,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     std::span<const T> constants, std::span<const std::uint32_t> cell_info0,
-    std::span<const std::uint32_t> cell_info1, void* custom_data = nullptr)
+    std::span<const std::uint32_t> cell_info1,
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (cells.empty())
     return;
@@ -110,7 +112,7 @@ void assemble_cells_matrix(
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), nullptr,
-           nullptr, custom_data);
+           nullptr, custom_data.value_or(nullptr));
 
     // Compute A = P_0 \tilde{A} P_1^T (dof transformation)
     P0(Ae, cell_info0, cell0, ndim1);  // B = P0 \tilde{A}
@@ -224,7 +226,7 @@ void assemble_entities(
     std::span<const T> constants, std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (entities.empty())
     return;
@@ -262,7 +264,7 @@ void assemble_entities(
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(f, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, custom_data);
+           &local_entity, &perm, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, cell0, ndim1);
     P1T(Ae, cell_info1, cell1, ndim0);
 
@@ -367,7 +369,7 @@ void assemble_interior_facets(
     std::span<const T> constants, std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (facets.empty())
     return;
@@ -444,7 +446,7 @@ void assemble_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     kernel(Ae.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), custom_data);
+           local_facet.data(), perm.data(), custom_data.value_or(nullptr));
 
     // Local element layout is a 2x2 block matrix with structure
     //
@@ -609,7 +611,8 @@ void assemble_matrix(
       std::span cells0 = a.domain_arg(IntegralType::cell, 0, i, cell_type_idx);
       std::span cells1 = a.domain_arg(IntegralType::cell, 1, i, cell_type_idx);
       auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
-      void* custom_data = a.custom_data(IntegralType::cell, i, cell_type_idx);
+      std::optional<void*> custom_data
+          = a.custom_data(IntegralType::cell, i, cell_type_idx);
       assert(cells.size() * cstride == coeffs.size());
       impl::assemble_cells_matrix(
           mat_set, x_dofmap, x, cells, {dofs0, bs0, cells0}, P0,
@@ -651,7 +654,8 @@ void assemble_matrix(
       assert(fn);
       auto& [coeffs, cstride]
           = coefficients.at({IntegralType::interior_facet, i});
-      void* custom_data = a.custom_data(IntegralType::interior_facet, i, 0);
+      std::optional<void*> custom_data
+          = a.custom_data(IntegralType::interior_facet, i, 0);
 
       std::span facets = a.domain(IntegralType::interior_facet, i, 0);
       std::span facets0 = a.domain_arg(IntegralType::interior_facet, 0, i, 0);
@@ -694,7 +698,7 @@ void assemble_matrix(
         auto fn = a.kernel(itg_type, i, 0);
         assert(fn);
         auto& [coeffs, cstride] = coefficients.at({itg_type, i});
-        void* custom_data = a.custom_data(itg_type, i, 0);
+        std::optional<void*> custom_data = a.custom_data(itg_type, i, 0);
 
         std::span e = a.domain(itg_type, i, 0);
         mdspanx2_t entities(e.data(), e.size() / 2, 2);
diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 88f5187b3f..016c3a9bd4 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -17,6 +17,7 @@
 #include <dolfinx/mesh/Mesh.h>
 #include <dolfinx/mesh/Topology.h>
 #include <memory>
+#include <optional>
 #include <vector>
 
 namespace dolfinx::fem::impl
@@ -31,7 +32,7 @@ T assemble_cells(mdspan2_t x_dofmap,
                  std::span<const T> constants,
                  md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
                  std::span<scalar_value_t<T>> cdofs_b,
-                 void* custom_data = nullptr)
+                 std::optional<void*> custom_data = std::nullopt)
 {
   T value(0);
   if (cells.empty())
@@ -50,7 +51,7 @@ T assemble_cells(mdspan2_t x_dofmap,
       std::copy_n(&x(x_dofs[i], 0), 3, std::next(cdofs_b.begin(), 3 * i));
 
     fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr,
-       nullptr, custom_data);
+       nullptr, custom_data.value_or(nullptr));
   }
 
   return value;
@@ -78,7 +79,8 @@ T assemble_entities(
     FEkernel<T> auto fn, std::span<const T> constants,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    std::span<scalar_value_t<T>> cdofs_b, void* custom_data = nullptr)
+    std::span<scalar_value_t<T>> cdofs_b,
+    std::optional<void*> custom_data = std::nullopt)
 {
   T value(0);
   if (entities.empty())
@@ -100,7 +102,7 @@ T assemble_entities(
     // Permutations
     std::uint8_t perm = perms.empty() ? 0 : perms(cell, local_entity);
     fn(&value, &coeffs(f, 0), constants.data(), cdofs_b.data(), &local_entity,
-       &perm, custom_data);
+       &perm, custom_data.value_or(nullptr));
   }
 
   return value;
@@ -121,7 +123,8 @@ T assemble_interior_facets(
                                     md::dynamic_extent>>
         coeffs,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    std::span<scalar_value_t<T>> cdofs_b, void* custom_data = nullptr)
+    std::span<scalar_value_t<T>> cdofs_b,
+    std::optional<void*> custom_data = std::nullopt)
 {
   T value(0);
   if (facets.empty())
@@ -151,7 +154,7 @@ T assemble_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     fn(&value, &coeffs(f, 0, 0), constants.data(), cdofs_b.data(),
-       local_facet.data(), perm.data(), custom_data);
+       local_facet.data(), perm.data(), custom_data.value_or(nullptr));
   }
 
   return value;
@@ -179,7 +182,7 @@ T assemble_scalar(
     auto fn = M.kernel(IntegralType::cell, i, 0);
     assert(fn);
     auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
-    void* custom_data = M.custom_data(IntegralType::cell, i, 0);
+    std::optional<void*> custom_data = M.custom_data(IntegralType::cell, i, 0);
     std::span<const std::int32_t> cells = M.domain(IntegralType::cell, i, 0);
     assert(cells.size() * cstride == coeffs.size());
     value += impl::assemble_cells(
@@ -206,7 +209,8 @@ T assemble_scalar(
     assert(fn);
     auto& [coeffs, cstride]
         = coefficients.at({IntegralType::interior_facet, i});
-    void* custom_data = M.custom_data(IntegralType::interior_facet, i, 0);
+    std::optional<void*> custom_data
+        = M.custom_data(IntegralType::interior_facet, i, 0);
     std::span facets = M.domain(IntegralType::interior_facet, i, 0);
 
     constexpr std::size_t num_adjacent_cells = 2;
@@ -239,7 +243,7 @@ T assemble_scalar(
       auto fn = M.kernel(itg_type, i, 0);
       assert(fn);
       auto& [coeffs, cstride] = coefficients.at({itg_type, i});
-      void* custom_data = M.custom_data(itg_type, i, 0);
+      std::optional<void*> custom_data = M.custom_data(itg_type, i, 0);
 
       std::span entities = M.domain(itg_type, i, 0);
 
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index de03eade63..fdaa37471b 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -93,7 +93,7 @@ void _lift_bc_cells(
     std::span<const std::uint32_t> cell_info0,
     std::span<const std::uint32_t> cell_info1, std::span<const T> bc_values1,
     std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (cells.empty())
     return;
@@ -165,7 +165,7 @@ void _lift_bc_cells(
 
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           nullptr, nullptr, custom_data);
+           nullptr, nullptr, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, c0, num_cols);
     P1T(Ae, cell_info1, c1, num_rows);
 
@@ -288,7 +288,7 @@ void _lift_bc_entities(
     std::span<const std::uint32_t> cell_info1, std::span<const T> bc_values1,
     std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (entities.empty())
     return;
@@ -347,7 +347,7 @@ void _lift_bc_entities(
     std::uint8_t perm = perms.empty() ? 0 : perms(cell, local_entity);
     std::ranges::fill(Ae, 0);
     kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, custom_data);
+           &local_entity, &perm, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, cell0, num_cols);
     P1T(Ae, cell_info1, cell1, num_rows);
 
@@ -446,7 +446,7 @@ void _lift_bc_interior_facets(
     std::span<const std::uint32_t> cell_info1, std::span<const T> bc_values1,
     std::span<const std::int8_t> bc_markers1, std::span<const T> x0, T alpha,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (facets.empty())
     return;
@@ -562,7 +562,7 @@ void _lift_bc_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     kernel(Ae.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), custom_data);
+           local_facet.data(), perm.data(), custom_data.value_or(nullptr));
 
     if (cells0[0] >= 0)
       P0(Ae, cell_info0, cells0[0], num_cols);
@@ -674,7 +674,8 @@ void assemble_cells(
     std::tuple<mdspan2_t, int, std::span<const std::int32_t>> dofmap,
     FEkernel<T> auto kernel, std::span<const T> constants,
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
-    std::span<const std::uint32_t> cell_info0, void* custom_data = nullptr)
+    std::span<const std::uint32_t> cell_info0,
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (cells.empty())
     return;
@@ -701,7 +702,7 @@ void assemble_cells(
     // Tabulate vector for cell
     std::ranges::fill(be, 0);
     kernel(be.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           nullptr, nullptr, custom_data);
+           nullptr, nullptr, custom_data.value_or(nullptr));
     P0(be, cell_info0, c0, 1);
 
     // Scatter cell vector to 'global' vector array
@@ -773,7 +774,7 @@ void assemble_entities(
     md::mdspan<const T, md::dextents<std::size_t, 2>> coeffs,
     std::span<const std::uint32_t> cell_info0,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   if (entities.empty())
     return;
@@ -805,7 +806,7 @@ void assemble_entities(
     // Tabulate element vector
     std::ranges::fill(be, 0);
     kernel(be.data(), &coeffs(f, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, custom_data);
+           &local_entity, &perm, custom_data.value_or(nullptr));
     P0(be, cell_info0, cell0, 1);
 
     // Add element vector to global vector
@@ -871,7 +872,7 @@ void assemble_interior_facets(
         coeffs,
     std::span<const std::uint32_t> cell_info0,
     md::mdspan<const std::uint8_t, md::dextents<std::size_t, 2>> perms,
-    void* custom_data = nullptr)
+    std::optional<void*> custom_data = std::nullopt)
 {
   using X = scalar_value_t<T>;
 
@@ -923,7 +924,7 @@ void assemble_interior_facets(
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
     kernel(be.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), custom_data);
+           local_facet.data(), perm.data(), custom_data.value_or(nullptr));
 
     if (cells0[0] >= 0)
       P0(be, cell_info0, cells0[0], 1);
@@ -1031,7 +1032,7 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
     auto kernel = a.kernel(IntegralType::cell, i, 0);
     assert(kernel);
     auto& [_coeffs, cstride] = coefficients.at({IntegralType::cell, i});
-    void* custom_data = a.custom_data(IntegralType::cell, i, 0);
+    std::optional<void*> custom_data = a.custom_data(IntegralType::cell, i, 0);
     std::span cells = a.domain(IntegralType::cell, i, 0);
     std::span cells0 = a.domain_arg(IntegralType::cell, 0, i, 0);
     std::span cells1 = a.domain_arg(IntegralType::cell, 1, i, 0);
@@ -1079,7 +1080,8 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
     assert(kernel);
     auto& [coeffs, cstride]
         = coefficients.at({IntegralType::interior_facet, i});
-    void* custom_data = a.custom_data(IntegralType::interior_facet, i, 0);
+    std::optional<void*> custom_data
+        = a.custom_data(IntegralType::interior_facet, i, 0);
 
     using mdspanx22_t
         = md::mdspan<const std::int32_t,
@@ -1114,7 +1116,7 @@ void lift_bc(V&& b, const Form<T, U>& a, mdspan2_t x_dofmap,
       auto kernel = a.kernel(itg_type, i, 0);
       assert(kernel);
       auto& [coeffs, cstride] = coefficients.at({itg_type, i});
-      void* custom_data = a.custom_data(itg_type, i, 0);
+      std::optional<void*> custom_data = a.custom_data(itg_type, i, 0);
 
       using mdspanx2_t
           = md::mdspan<const std::int32_t,
@@ -1286,7 +1288,8 @@ void assemble_vector(
       std::span cells = L.domain(IntegralType::cell, i, cell_type_idx);
       std::span cells0 = L.domain_arg(IntegralType::cell, 0, i, cell_type_idx);
       auto& [coeffs, cstride] = coefficients.at({IntegralType::cell, i});
-      void* custom_data = L.custom_data(IntegralType::cell, i, cell_type_idx);
+      void* custom_data = L.custom_data(IntegralType::cell, i, cell_type_idx)
+                              .value_or(nullptr);
       assert(cells.size() * cstride == coeffs.size());
       if (bs == 1)
       {
@@ -1341,7 +1344,8 @@ void assemble_vector(
       assert(fn);
       auto& [coeffs, cstride]
           = coefficients.at({IntegralType::interior_facet, i});
-      void* custom_data = L.custom_data(IntegralType::interior_facet, i, 0);
+      void* custom_data
+          = L.custom_data(IntegralType::interior_facet, i, 0).value_or(nullptr);
       std::span facets = L.domain(IntegralType::interior_facet, i, 0);
       std::span facets1 = L.domain_arg(IntegralType::interior_facet, 0, i, 0);
       assert((facets.size() / 4) * 2 * cstride == coeffs.size());
@@ -1393,7 +1397,7 @@ void assemble_vector(
         auto fn = L.kernel(itg_type, i, 0);
         assert(fn);
         auto& [coeffs, cstride] = coefficients.at({itg_type, i});
-        void* custom_data = L.custom_data(itg_type, i, 0);
+        void* custom_data = L.custom_data(itg_type, i, 0).value_or(nullptr);
         std::span e = L.domain(itg_type, i, 0);
         mdspanx2_t entities(e.data(), e.size() / 2, 2);
         std::span e1 = L.domain_arg(itg_type, 0, i, 0);
diff --git a/python/dolfinx/wrappers/fem.cpp b/python/dolfinx/wrappers/fem.cpp
index 52ec70091d..738fab4540 100644
--- a/python/dolfinx/wrappers/fem.cpp
+++ b/python/dolfinx/wrappers/fem.cpp
@@ -764,19 +764,28 @@ void declare_form(nb::module_& m, std::string type)
       .def(
           "set_custom_data",
           [](dolfinx::fem::Form<T, U>& self, dolfinx::fem::IntegralType type,
-             int id, int kernel_idx, std::uintptr_t data)
-          { self.set_custom_data(type, id, kernel_idx, (void*)data); },
+             int id, int kernel_idx, std::optional<std::uintptr_t> data)
+          {
+            self.set_custom_data(type, id, kernel_idx,
+                                 data ? std::optional<void*>((void*)*data)
+                                      : std::nullopt);
+          },
           nb::arg("type"), nb::arg("id"), nb::arg("kernel_idx"),
-          nb::arg("data"),
+          nb::arg("data").none(),
           "Set custom data pointer for an integral. The data pointer is "
-          "passed to the kernel.")
+          "passed to the kernel. Pass None to clear.")
       .def(
           "custom_data",
           [](const dolfinx::fem::Form<T, U>& self,
-             dolfinx::fem::IntegralType type, int id, int kernel_idx)
-          { return (std::uintptr_t)self.custom_data(type, id, kernel_idx); },
+             dolfinx::fem::IntegralType type, int id,
+             int kernel_idx) -> std::optional<std::uintptr_t>
+          {
+            auto cd = self.custom_data(type, id, kernel_idx);
+            return cd ? std::optional<std::uintptr_t>((std::uintptr_t)*cd)
+                      : std::nullopt;
+          },
           nb::arg("type"), nb::arg("id"), nb::arg("kernel_idx"),
-          "Get custom data pointer for an integral.")
+          "Get custom data pointer for an integral, or None if not set.")
       .def(
           "domains",
           [](const dolfinx::fem::Form<T, U>& self,
diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index fee5ad88cf..013b545846 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -236,8 +236,8 @@ def tabulate_simple(b_, w_, c_, coords_, local_index, orientation, custom_data):
     formtype = form_cpp_class(dtype)
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
-    # custom_data should be 0 (nullptr) by default
-    assert L._cpp_object.custom_data(IntegralType.cell, 0, 0) == 0
+    # custom_data should be None (std::nullopt) by default
+    assert L._cpp_object.custom_data(IntegralType.cell, 0, 0) is None
 
 
 @pytest.mark.parametrize("dtype", [np.float64])

From 02d2cfd7a8a8d7617034b871e8ac9b01a95c3959 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Wed, 3 Dec 2025 19:06:50 +0100
Subject: [PATCH 03/17] Pass custom_data at Form creation time (immutable Form
 pattern)

Address review comments to maintain Form immutability:
- Remove set_custom_data method from Form class
- Pass custom_data as 5th element of integrals tuple at Form creation
- Integrals tuple format: (id, kernel_ptr, entities, coeffs, custom_data)
- custom_data can be None (maps to std::nullopt) or a pointer (uintptr_t)

Pass cell index through entity_local_index for cell integrals:
- Cell integrals now receive &cell instead of nullptr for entity_local_index
- Enables per-cell data lookup in custom kernels via custom_data
- FFCx-generated kernels don't read entity_local_index for cells (return 0)

Safety documentation:
- Add warning about void pointer safety to Form.h and Python bindings
- Document that users must keep data alive while Form is in use

Update tests:
- Update test_custom_jit_kernels.py to use 5-element tuples
- Expand test_custom_data.py with per-cell material property example
---
 cpp/dolfinx/fem/Form.h                        |  19 +-
 cpp/dolfinx/fem/assemble_matrix_impl.h        |   2 +-
 cpp/dolfinx/fem/assemble_scalar_impl.h        |   4 +-
 cpp/dolfinx/fem/assemble_vector_impl.h        |   8 +-
 python/dolfinx/wrappers/fem.cpp               |  44 ++-
 python/test/unit/fem/test_custom_data.py      | 309 ++++++++++++++++--
 .../test/unit/fem/test_custom_jit_kernels.py  |  16 +-
 7 files changed, 330 insertions(+), 72 deletions(-)

diff --git a/cpp/dolfinx/fem/Form.h b/cpp/dolfinx/fem/Form.h
index 6c71141b8a..0f0a0e78a6 100644
--- a/cpp/dolfinx/fem/Form.h
+++ b/cpp/dolfinx/fem/Form.h
@@ -405,6 +405,10 @@ class Form
   /// assembly. This can be used to pass runtime-computed data to
   /// kernels (e.g., per-cell quadrature rules, material properties).
   ///
+  /// @warning Void pointers are inherently unsafe and cannot be type or
+  /// bounds checked. Incorrect usage of this feature can and will lead
+  /// to undefined behaviour and crashes.
+  ///
   /// @param[in] type Integral type.
   /// @param[in] id Integral subdomain ID.
   /// @param[in] kernel_idx Index of the kernel (we may have multiple
@@ -419,21 +423,6 @@ class Form
     return it->second.custom_data;
   }
 
-  /// @brief Set the custom data pointer for an integral.
-  ///
-  /// @param[in] type Integral type.
-  /// @param[in] id Integral subdomain ID.
-  /// @param[in] kernel_idx Index of the kernel.
-  /// @param[in] data Custom data pointer to set, or std::nullopt to clear.
-  void set_custom_data(IntegralType type, int id, int kernel_idx,
-                       std::optional<void*> data)
-  {
-    auto it = _integrals.find({type, id, kernel_idx});
-    if (it == _integrals.end())
-      throw std::runtime_error("Requested integral not found.");
-    it->second.custom_data = data;
-  }
-
   /// @brief Get types of integrals in the form.
   /// @return Integrals types.
   std::set<IntegralType> integral_types() const
diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index 5b68e157a8..0737a2463a 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -111,7 +111,7 @@ void assemble_cells_matrix(
 
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
-    kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), nullptr,
+    kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), &cell,
            nullptr, custom_data.value_or(nullptr));
 
     // Compute A = P_0 \tilde{A} P_1^T (dof transformation)
diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 016c3a9bd4..4b305f8f4a 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -50,8 +50,8 @@ T assemble_cells(mdspan2_t x_dofmap,
     for (std::size_t i = 0; i < x_dofs.size(); ++i)
       std::copy_n(&x(x_dofs[i], 0), 3, std::next(cdofs_b.begin(), 3 * i));
 
-    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr,
-       nullptr, custom_data.value_or(nullptr));
+    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), &c, nullptr,
+       custom_data.value_or(nullptr));
   }
 
   return value;
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index fdaa37471b..5e71b2b469 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -164,8 +164,8 @@ void _lift_bc_cells(
     auto dofs0 = md::submdspan(dmap0, c0, md::full_extent);
 
     std::ranges::fill(Ae, 0);
-    kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           nullptr, nullptr, custom_data.value_or(nullptr));
+    kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(), &c,
+           nullptr, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, c0, num_cols);
     P1T(Ae, cell_info1, c1, num_rows);
 
@@ -701,8 +701,8 @@ void assemble_cells(
 
     // Tabulate vector for cell
     std::ranges::fill(be, 0);
-    kernel(be.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
-           nullptr, nullptr, custom_data.value_or(nullptr));
+    kernel(be.data(), &coeffs(index, 0), constants.data(), cdofs.data(), &c,
+           nullptr, custom_data.value_or(nullptr));
     P0(be, cell_info0, c0, 1);
 
     // Scatter cell vector to 'global' vector array
diff --git a/python/dolfinx/wrappers/fem.cpp b/python/dolfinx/wrappers/fem.cpp
index 738fab4540..06cbe5ac36 100644
--- a/python/dolfinx/wrappers/fem.cpp
+++ b/python/dolfinx/wrappers/fem.cpp
@@ -667,8 +667,8 @@ void declare_form(nb::module_& m, std::string type)
                  std::vector<std::tuple<
                      int, std::uintptr_t,
                      nb::ndarray<const std::int32_t, nb::ndim<1>, nb::c_contig>,
-                     nb::ndarray<const std::int8_t, nb::ndim<1>,
-                                 nb::c_contig>>>>& integrals,
+                     nb::ndarray<const std::int8_t, nb::ndim<1>, nb::c_contig>,
+                     std::optional<std::uintptr_t>>>>& integrals,
              const std::vector<std::shared_ptr<
                  const dolfinx::fem::Function<T, U>>>& coefficients,
              const std::vector<
@@ -684,17 +684,22 @@ void declare_form(nb::module_& m, std::string type)
             // Loop over kernel for each entity type
             for (auto& [type, kernels] : integrals)
             {
-              for (auto& [id, ptr, e, c] : kernels)
+              for (auto& [id, ptr, e, c, custom_data] : kernels)
               {
                 auto kn_ptr
                     = (void (*)(T*, const T*, const T*,
                                 const typename geom_type<T>::value_type*,
                                 const int*, const std::uint8_t*, void*))ptr;
+                // Convert custom_data from std::optional<uintptr_t> to
+                // std::optional<void*>
+                std::optional<void*> cd = std::nullopt;
+                if (custom_data.has_value())
+                  cd = reinterpret_cast<void*>(custom_data.value());
                 _integrals.insert(
                     {{type, id, 0},
                      {kn_ptr,
                       std::vector<std::int32_t>(e.data(), e.data() + e.size()),
-                      std::vector<int>(c.data(), c.data() + c.size())}});
+                      std::vector<int>(c.data(), c.data() + c.size()), cd}});
               }
             }
 
@@ -761,31 +766,34 @@ void declare_form(nb::module_& m, std::string type)
       .def_prop_ro("integral_types", &dolfinx::fem::Form<T, U>::integral_types)
       .def_prop_ro("needs_facet_permutations",
                    &dolfinx::fem::Form<T, U>::needs_facet_permutations)
-      .def(
-          "set_custom_data",
-          [](dolfinx::fem::Form<T, U>& self, dolfinx::fem::IntegralType type,
-             int id, int kernel_idx, std::optional<std::uintptr_t> data)
-          {
-            self.set_custom_data(type, id, kernel_idx,
-                                 data ? std::optional<void*>((void*)*data)
-                                      : std::nullopt);
-          },
-          nb::arg("type"), nb::arg("id"), nb::arg("kernel_idx"),
-          nb::arg("data").none(),
-          "Set custom data pointer for an integral. The data pointer is "
-          "passed to the kernel. Pass None to clear.")
       .def(
           "custom_data",
           [](const dolfinx::fem::Form<T, U>& self,
              dolfinx::fem::IntegralType type, int id,
              int kernel_idx) -> std::optional<std::uintptr_t>
           {
+            /* We use std::uintptr_t to map void* into Python to allow
+            seamless interop with numpy.ctypes.data.
+            Example:
+            # Create data to pass to kernel:
+            data = np.array([2.0], dtype=np.float64)
+            # Pass pointer at Form creation via integrals tuple:
+            integrals = {IntegralType.cell: [(id, kernel_ptr, cells,
+            coeffs, data.ctypes.data)]}
+            formtype = form_cpp_class(dtype)
+            L = Form(formtype([V._cpp_object], integrals, [], [], False, [],
+            mesh=mesh._cpp_object)) Important: You must keep the NumPy array
+            alive in Python while the form is being used; if the array is
+            garbage collected, the pointer becomes invalid! */
             auto cd = self.custom_data(type, id, kernel_idx);
             return cd ? std::optional<std::uintptr_t>((std::uintptr_t)*cd)
                       : std::nullopt;
           },
           nb::arg("type"), nb::arg("id"), nb::arg("kernel_idx"),
-          "Get custom data pointer for an integral, or None if not set.")
+          "Get custom data pointer for an integral, or None if not set. "
+          "Warning: void-pointers are inherently unsafe and cannot be type "
+          "or bounds checked. Incorrect usage of this feature can and will "
+          "lead to undefined behaviour and crashes.")
       .def(
           "domains",
           [](const dolfinx::fem::Form<T, U>& self,
diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index 013b545846..bfe0f15579 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -126,14 +126,14 @@ def test_custom_data_vector_assembly(dtype):
     cells = np.arange(num_cells, dtype=np.int32)
     active_coeffs = np.array([], dtype=np.int8)
 
-    integrals = {IntegralType.cell: [(0, k1.address, cells, active_coeffs)]}
-    formtype = form_cpp_class(dtype)
-    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
-
-    # Create custom_data with scale=1.0 first
+    # Create custom_data with scale=1.0
     scale_value = np.array([1.0], dtype=dtype)
     scale_ptr = scale_value.ctypes.data
-    L._cpp_object.set_custom_data(IntegralType.cell, 0, 0, scale_ptr)
+
+    # Pass custom_data at form creation time via the 5th element of the integrals tuple
+    integrals = {IntegralType.cell: [(0, k1.address, cells, active_coeffs, scale_ptr)]}
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
     # Assemble with scale=1.0
     b1 = dolfinx.fem.assemble_vector(L)
@@ -143,7 +143,7 @@ def test_custom_data_vector_assembly(dtype):
     # Verify we can read back the custom_data pointer
     assert L._cpp_object.custom_data(IntegralType.cell, 0, 0) == scale_ptr
 
-    # Update custom_data to scale=2.0
+    # Update custom_data to scale=2.0 (by modifying the underlying array)
     scale_value[0] = 2.0
     b2 = dolfinx.fem.assemble_vector(L)
     b2.scatter_reverse(la.InsertMode.add)
@@ -173,7 +173,13 @@ def test_custom_data_matrix_assembly(dtype):
     cells = np.arange(mesh.topology.index_map(mesh.topology.dim).size_local, dtype=np.int32)
     active_coeffs = np.array([], dtype=np.int8)
 
-    integrals = {IntegralType.cell: [(0, k2.address, cells, active_coeffs)]}
+    # Create custom_data with scale=1.0
+    scale_value = np.array([1.0], dtype=dtype)
+
+    # Pass custom_data at form creation time via the 5th element
+    integrals = {
+        IntegralType.cell: [(0, k2.address, cells, active_coeffs, scale_value.ctypes.data)]
+    }
     formtype = form_cpp_class(dtype)
     a = Form(
         formtype(
@@ -187,16 +193,12 @@ def test_custom_data_matrix_assembly(dtype):
         )
     )
 
-    # Set custom_data with scale=1.0 first
-    scale_value = np.array([1.0], dtype=dtype)
-    a._cpp_object.set_custom_data(IntegralType.cell, 0, 0, scale_value.ctypes.data)
-
     # Assemble with scale=1.0
     A1 = dolfinx.fem.assemble_matrix(a)
     A1.scatter_reverse()
     norm1 = np.sqrt(A1.squared_norm())
 
-    # Update custom_data to scale=2.0
+    # Update custom_data to scale=2.0 (by modifying the underlying array)
     scale_value[0] = 2.0
     A2 = dolfinx.fem.assemble_matrix(a)
     A2.scatter_reverse()
@@ -208,7 +210,7 @@ def test_custom_data_matrix_assembly(dtype):
 
 @pytest.mark.parametrize("dtype", [np.float64])
 def test_custom_data_default_nullptr(dtype):
-    """Test that custom_data defaults to nullptr (0)."""
+    """Test that custom_data defaults to nullptr (None) when not provided."""
     xdtype = np.real(dtype(0)).dtype
 
     # Define a simple kernel that doesn't use custom_data
@@ -232,11 +234,12 @@ def tabulate_simple(b_, w_, c_, coords_, local_index, orientation, custom_data):
     cells = np.arange(num_cells, dtype=np.int32)
     active_coeffs = np.array([], dtype=np.int8)
 
-    integrals = {IntegralType.cell: [(0, tabulate_simple.address, cells, active_coeffs)]}
+    # Pass None as custom_data (5th element) to get std::nullopt
+    integrals = {IntegralType.cell: [(0, tabulate_simple.address, cells, active_coeffs, None)]}
     formtype = form_cpp_class(dtype)
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
-    # custom_data should be None (std::nullopt) by default
+    # custom_data should be None (std::nullopt) when passed as None
     assert L._cpp_object.custom_data(IntegralType.cell, 0, 0) is None
 
 
@@ -271,17 +274,273 @@ def tabulate_with_struct(b_, w_, c_, coords_, local_index, orientation, custom_d
     cells = np.arange(num_cells, dtype=np.int32)
     active_coeffs = np.array([], dtype=np.int8)
 
-    integrals = {IntegralType.cell: [(0, tabulate_with_struct.address, cells, active_coeffs)]}
+    # Test 1: scale=1.0, offset=0.0 (baseline)
+    struct_data = np.array([1.0, 0.0], dtype=dtype)
+
+    # Pass custom_data at form creation time
+    integrals = {
+        IntegralType.cell: [
+            (0, tabulate_with_struct.address, cells, active_coeffs, struct_data.ctypes.data)
+        ]
+    }
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
+
+    b_baseline = dolfinx.fem.assemble_vector(L)
+    b_baseline.scatter_reverse(la.InsertMode.add)
+    norm_baseline = la.norm(b_baseline)
+
+    # Test 2: scale=2.0, offset=0.0 - should double the norm
+    struct_data[0] = 2.0
+    struct_data[1] = 0.0
+    b_scaled = dolfinx.fem.assemble_vector(L)
+    b_scaled.scatter_reverse(la.InsertMode.add)
+    norm_scaled = la.norm(b_scaled)
+    assert np.isclose(norm_scaled, 2.0 * norm_baseline)
+
+    # Test 3: scale=0.0, offset=1.0 - pure offset contribution
+    struct_data[0] = 0.0
+    struct_data[1] = 1.0
+    b_offset = dolfinx.fem.assemble_vector(L)
+    b_offset.scatter_reverse(la.InsertMode.add)
+    # With offset=1.0, each DOF gets contribution from each cell it touches
+    # Interior nodes touch 6 cells, edge nodes touch 3-4, corner nodes touch 1-2
+    # The sum of all contributions equals 3 * num_cells (3 DOFs per cell, offset=1.0 each)
+    total_contribution = np.sum(b_offset.array)
+    assert np.isclose(total_contribution, 3.0 * num_cells)
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_multiple_parameters(dtype):
+    """Test custom_data with multiple parameters demonstrating complex data passing.
+
+    This test shows how to pass multiple values through custom_data, simulating
+    the use case of passing runtime-computed parameters like material properties
+    or integration parameters.
+    """
+    xdtype = np.real(dtype(0)).dtype
+
+    # Kernel that uses three parameters: coefficient, exponent base, and additive term
+    # Computes: coeff * base^area + additive
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate_with_params(b_, w_, c_, coords_, local_index, orientation, custom_data):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Read three parameters from custom_data
+        typed_ptr = voidptr_to_float64_ptr(custom_data)
+        coeff = typed_ptr[0]
+        power = typed_ptr[1]
+        additive = typed_ptr[2]
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        # Use power as a simple multiplier (avoiding actual power function complexity)
+        val = coeff * power * Ae / 6.0 + additive
+        b[:] = val
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 4, 4, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    # Test with specific parameters: coeff=2, power=3, additive=0
+    params = np.array([2.0, 3.0, 0.0], dtype=dtype)
+
+    # Pass custom_data at form creation time
+    integrals = {
+        IntegralType.cell: [
+            (0, tabulate_with_params.address, cells, active_coeffs, params.ctypes.data)
+        ]
+    }
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
+
+    b1 = dolfinx.fem.assemble_vector(L)
+    b1.scatter_reverse(la.InsertMode.add)
+    norm1 = la.norm(b1)
+
+    # Change parameters: coeff=1, power=6, additive=0 (should give same result: 1*6 = 2*3)
+    params[0] = 1.0
+    params[1] = 6.0
+    b2 = dolfinx.fem.assemble_vector(L)
+    b2.scatter_reverse(la.InsertMode.add)
+    norm2 = la.norm(b2)
+
+    assert np.isclose(norm1, norm2)
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_global_parameter_update(dtype):
+    """Test updating custom_data between assemblies for parameter studies.
+
+    This demonstrates a key use case: running multiple assemblies with
+    different parameter values without recreating the form. The custom_data
+    points to a parameter that can be modified between assembly calls.
+    """
+    xdtype = np.real(dtype(0)).dtype
+
+    # Kernel that reads a diffusion coefficient from custom_data
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate_diffusion(b_, w_, c_, coords_, local_index, orientation, custom_data):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Read diffusion coefficient from custom_data
+        typed_ptr = voidptr_to_float64_ptr(custom_data)
+        kappa = typed_ptr[0]  # Diffusion coefficient
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        # 2x Element area Ae
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        # Simple load vector scaled by diffusion coefficient
+        b[:] = kappa * Ae / 6.0
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 8, 8, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    # Parameter array - this can be updated between assemblies
+    kappa = np.array([1.0], dtype=dtype)
+
+    integrals = {
+        IntegralType.cell: [
+            (0, tabulate_diffusion.address, cells, active_coeffs, kappa.ctypes.data)
+        ]
+    }
     formtype = form_cpp_class(dtype)
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
-    # Create struct data: [scale=2.0, offset=0.5]
-    struct_data = np.array([2.0, 0.5], dtype=dtype)
-    L._cpp_object.set_custom_data(IntegralType.cell, 0, 0, struct_data.ctypes.data)
+    # Store results for different kappa values
+    results = []
+    kappa_values = [0.1, 1.0, 10.0, 100.0]
+
+    for k in kappa_values:
+        kappa[0] = k
+        b = dolfinx.fem.assemble_vector(L)
+        b.scatter_reverse(la.InsertMode.add)
+        results.append(la.norm(b))
+
+    # Verify linear scaling: norm should scale linearly with kappa
+    # norm(kappa=k) / norm(kappa=1) should equal k
+    base_norm = results[1]  # kappa=1.0
+    for i, k in enumerate(kappa_values):
+        expected_ratio = k / 1.0
+        actual_ratio = results[i] / base_norm
+        assert np.isclose(actual_ratio, expected_ratio, rtol=1e-10)
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_per_cell_material(dtype):
+    """Test custom_data with per-cell material properties using cell index.
+
+    This test demonstrates the use case where a kernel needs to access
+    cell-specific data. The cell index is now passed through entity_local_index
+    for cell integrals, allowing the kernel to look up per-cell values.
+
+    The custom_data pointer points to an array of values indexed by cell number.
+    The kernel uses entity_local_index[0] (which contains the cell index for
+    cell integrals) to look up the appropriate value.
+    """
+    xdtype = np.real(dtype(0)).dtype
+
+    # Kernel that reads per-cell material property from custom_data
+    # custom_data points to array: material_values[cell_index]
+    # entity_local_index[0] contains the cell index for cell integrals
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate_per_cell_material(
+        b_, w_, c_, coords_, entity_local_index, cell_orientation, custom_data
+    ):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Cast void* to float64* - this points to per-cell material array
+        material_array = voidptr_to_float64_ptr(custom_data)
+
+        # entity_local_index[0] contains the cell index for cell integrals
+        cell_idx_ptr = numba.carray(entity_local_index, (1,), dtype=np.int32)
+        cell_idx = cell_idx_ptr[0]
+
+        # Look up the material property for this specific cell
+        material_value = material_array[cell_idx]
+
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        # 2x Element area Ae
+        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
+        b[:] = material_value * Ae / 6.0
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 4, 4, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
 
-    b = dolfinx.fem.assemble_vector(L)
-    b.scatter_reverse(la.InsertMode.add)
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+
+    # Create per-cell material array - all cells have material=1.0
+    material_values = np.ones(num_cells, dtype=dtype)
+
+    # Pass pointer to material array as custom_data
+    integrals = {
+        IntegralType.cell: [
+            (
+                0,
+                tabulate_per_cell_material.address,
+                cells,
+                active_coeffs,
+                material_values.ctypes.data,
+            )
+        ]
+    }
+    formtype = form_cpp_class(dtype)
+    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
-    # Verify the assembly used our custom values
-    # The offset should contribute to each DOF
-    assert la.norm(b) > 0
+    # Assemble with uniform material=1.0
+    b_uniform = dolfinx.fem.assemble_vector(L)
+    b_uniform.scatter_reverse(la.InsertMode.add)
+    norm_uniform = la.norm(b_uniform)
+
+    # Now set material=2.0 for all cells - should double the result
+    material_values[:] = 2.0
+    b_doubled = dolfinx.fem.assemble_vector(L)
+    b_doubled.scatter_reverse(la.InsertMode.add)
+    norm_doubled = la.norm(b_doubled)
+
+    assert np.isclose(norm_doubled, 2.0 * norm_uniform)
+
+    # Test heterogeneous material: first half of cells have material=1.0,
+    # second half have material=3.0
+    material_values[: num_cells // 2] = 1.0
+    material_values[num_cells // 2 :] = 3.0
+    b_hetero = dolfinx.fem.assemble_vector(L)
+    b_hetero.scatter_reverse(la.InsertMode.add)
+
+    # The result should be between uniform material=1.0 and material=3.0
+    norm_hetero = la.norm(b_hetero)
+    assert norm_uniform < norm_hetero < 3.0 * norm_uniform
+
+    # Verify the total contribution matches expected:
+    # The kernel computes Ae = 2*area (determinant formula), so:
+    # Each cell contributes material[i] * Ae / 6 = material[i] * area / 3 to each of 3 DOFs
+    # Total per cell = 3 * material[i] * area / 3 = material[i] * area
+    # Total = sum_i (material[i] * area_i)
+    # For uniform mesh on unit square with 4x4 grid: 32 triangles, each area 1/32
+    total_contribution = np.sum(b_hetero.array)
+    expected_sum = np.sum(material_values) * (1.0 / num_cells)
+    assert np.isclose(total_contribution, expected_sum)
diff --git a/python/test/unit/fem/test_custom_jit_kernels.py b/python/test/unit/fem/test_custom_jit_kernels.py
index 242e867469..3f798f66c1 100644
--- a/python/test/unit/fem/test_custom_jit_kernels.py
+++ b/python/test/unit/fem/test_custom_jit_kernels.py
@@ -93,9 +93,9 @@ def test_numba_assembly(dtype):
     active_coeffs = np.array([], dtype=np.int8)
     integrals = {
         IntegralType.cell: [
-            (0, k2.address, cells, active_coeffs),
-            (1, k2.address, np.arange(0), active_coeffs),
-            (2, k2.address, np.arange(0), active_coeffs),
+            (0, k2.address, cells, active_coeffs, None),
+            (1, k2.address, np.arange(0), active_coeffs, None),
+            (2, k2.address, np.arange(0), active_coeffs, None),
         ]
     }
     formtype = form_cpp_class(dtype)
@@ -104,7 +104,7 @@ def test_numba_assembly(dtype):
             [V._cpp_object, V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object
         )
     )
-    integrals = {IntegralType.cell: [(0, k1.address, cells, active_coeffs)]}
+    integrals = {IntegralType.cell: [(0, k1.address, cells, active_coeffs, None)]}
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
     A = dolfinx.fem.assemble_matrix(a)
@@ -135,7 +135,9 @@ def test_coefficient(dtype):
     num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
     active_coeffs = np.array([0], dtype=np.int8)
     integrals = {
-        IntegralType.cell: [(0, k1.address, np.arange(num_cells, dtype=np.int32), active_coeffs)]
+        IntegralType.cell: [
+            (0, k1.address, np.arange(num_cells, dtype=np.int32), active_coeffs, None)
+        ]
     }
     formtype = form_cpp_class(dtype)
     L = Form(
@@ -275,7 +277,7 @@ def test_cffi_assembly():
 
     ptrA = ffi.cast("intptr_t", ffi.addressof(lib, "tabulate_tensor_poissonA"))
     active_coeffs = np.array([], dtype=np.int8)
-    integrals = {IntegralType.cell: [(0, ptrA, cells, active_coeffs)]}
+    integrals = {IntegralType.cell: [(0, ptrA, cells, active_coeffs, None)]}
     a = Form(
         _cpp.fem.Form_float64(
             [V._cpp_object, V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object
@@ -283,7 +285,7 @@ def test_cffi_assembly():
     )
 
     ptrL = ffi.cast("intptr_t", ffi.addressof(lib, "tabulate_tensor_poissonL"))
-    integrals = {IntegralType.cell: [(0, ptrL, cells, active_coeffs)]}
+    integrals = {IntegralType.cell: [(0, ptrL, cells, active_coeffs, None)]}
     L = Form(
         _cpp.fem.Form_float64([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object)
     )

From c073221cc62f38067c34df9801dd7ac7c49edc6b Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Wed, 3 Dec 2025 21:49:08 +0100
Subject: [PATCH 04/17] add None as fifth argument to integrals

---
 python/demo/demo_static-condensation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/demo/demo_static-condensation.py b/python/demo/demo_static-condensation.py
index e6b0f6b5b2..ad0b868ca2 100644
--- a/python/demo/demo_static-condensation.py
+++ b/python/demo/demo_static-condensation.py
@@ -242,7 +242,7 @@ def tabulate_A(A_, w_, c_, coords_, entity_local_index, permutation=ffi.NULL, cu
 
 formtype = form_cpp_class(dtype)  # type: ignore
 cells = np.arange(msh.topology.index_map(msh.topology.dim).size_local)
-integrals = {IntegralType.cell: [(0, tabulate_A.address, cells, np.array([], dtype=np.int8))]}
+integrals = {IntegralType.cell: [(0, tabulate_A.address, cells, np.array([], dtype=np.int8), None)]}
 a_cond = Form(
     formtype([U._cpp_object, U._cpp_object], integrals, [], [], False, [], mesh=msh._cpp_object)
 )

From 8ea554b5a8512d2f307a0e1d3554f812d1281327 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Wed, 3 Dec 2025 22:05:28 +0100
Subject: [PATCH 05/17] add runtime quadrature rule test

---
 python/test/unit/fem/test_custom_data.py | 216 +++++++++++++++++++++++
 1 file changed, 216 insertions(+)

diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index bfe0f15579..75e8d57af5 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -544,3 +544,219 @@ def tabulate_per_cell_material(
     total_contribution = np.sum(b_hetero.array)
     expected_sum = np.sum(material_values) * (1.0 / num_cells)
     assert np.isclose(total_contribution, expected_sum)
+
+
+@pytest.mark.parametrize("dtype", [np.float64])
+def test_custom_data_runtime_quadrature(dtype):
+    """Test custom_data with per-cell runtime quadrature rules.
+
+    This test demonstrates the key use case of passing runtime-computed
+    quadrature points and weights to a kernel via custom_data. This enables:
+    - Adaptive quadrature based on solution features
+    - Different quadrature rules per cell (hp-adaptivity)
+    - Quadrature rules computed from external sources
+
+    The custom_data points to an array of quadrature rule data for each cell.
+    The kernel uses entity_local_index[0] (cell index) to look up the
+    quadrature rule for that specific cell.
+
+    Layout per cell: [num_points, xi_0, eta_0, w_0, xi_1, eta_1, w_1, ...]
+    We use fixed-size slots (max 3 points = 10 doubles per cell) for simplicity.
+    """
+    xdtype = np.real(dtype(0)).dtype
+
+    # Fixed slot size: 1 (num_points) + 3*3 (max 3 points with xi, eta, weight) = 10
+    SLOT_SIZE = 10
+
+    # Kernel that integrates using per-cell quadrature from custom_data
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate_with_per_cell_quadrature(
+        b_, w_, c_, coords_, entity_local_index, orientation, custom_data
+    ):
+        b = numba.carray(b_, (3), dtype=dtype)
+        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
+
+        # Get cell index from entity_local_index
+        cell_idx_ptr = numba.carray(entity_local_index, (1,), dtype=np.int32)
+        cell_idx = cell_idx_ptr[0]
+
+        # Read quadrature data for this cell from custom_data
+        # Layout: quad_data[cell_idx * SLOT_SIZE : (cell_idx + 1) * SLOT_SIZE]
+        quad_data = voidptr_to_float64_ptr(custom_data)
+        slot_start = cell_idx * 10  # SLOT_SIZE = 10
+        num_points = int(quad_data[slot_start])
+
+        # Get physical coordinates of triangle vertices
+        x0, y0 = coordinate_dofs[0, :2]
+        x1, y1 = coordinate_dofs[1, :2]
+        x2, y2 = coordinate_dofs[2, :2]
+
+        # Jacobian determinant (2 * area)
+        detJ = abs((x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0))
+
+        # Integrate: sum over quadrature points
+        b0 = dtype(0.0)
+        b1 = dtype(0.0)
+        b2 = dtype(0.0)
+
+        for q in range(num_points):
+            # Each quad point has: xi, eta, weight
+            xi = quad_data[slot_start + 1 + q * 3]
+            eta = quad_data[slot_start + 1 + q * 3 + 1]
+            w = quad_data[slot_start + 1 + q * 3 + 2]
+
+            # Basis function values at (xi, eta)
+            # N0 = 1 - xi - eta, N1 = xi, N2 = eta
+            N0 = 1.0 - xi - eta
+            N1 = xi
+            N2 = eta
+
+            # Accumulate: integral of N_i over element
+            b0 += N0 * w * detJ
+            b1 += N1 * w * detJ
+            b2 += N2 * w * detJ
+
+        b[0] = b0
+        b[1] = b1
+        b[2] = b2
+
+    mesh = create_unit_square(MPI.COMM_WORLD, 4, 4, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+
+    tdim = mesh.topology.dim
+    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
+    cells = np.arange(num_cells, dtype=np.int32)
+    active_coeffs = np.array([], dtype=np.int8)
+    formtype = form_cpp_class(dtype)
+
+    # Define quadrature rules
+    # 1-point centroid rule (exact for degree 1)
+    quad_1pt = [1.0, 1.0 / 3.0, 1.0 / 3.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+
+    # 3-point Gauss rule (exact for degree 2)
+    quad_3pt = [
+        3.0,
+        1.0 / 6.0,
+        1.0 / 6.0,
+        1.0 / 6.0,
+        2.0 / 3.0,
+        1.0 / 6.0,
+        1.0 / 6.0,
+        1.0 / 6.0,
+        2.0 / 3.0,
+        1.0 / 6.0,
+    ]
+
+    # Test 1: All cells use 1-point rule
+    quad_data_uniform_1pt = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
+    for c in range(num_cells):
+        quad_data_uniform_1pt[c, :] = quad_1pt
+
+    integrals_1pt = {
+        IntegralType.cell: [
+            (
+                0,
+                tabulate_with_per_cell_quadrature.address,
+                cells,
+                active_coeffs,
+                quad_data_uniform_1pt.ctypes.data,
+            )
+        ]
+    }
+    L_1pt = Form(formtype([V._cpp_object], integrals_1pt, [], [], False, [], mesh=mesh._cpp_object))
+
+    b_1pt = dolfinx.fem.assemble_vector(L_1pt)
+    b_1pt.scatter_reverse(la.InsertMode.add)
+    total_1pt = np.sum(b_1pt.array)
+
+    # Test 2: All cells use 3-point rule
+    quad_data_uniform_3pt = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
+    for c in range(num_cells):
+        quad_data_uniform_3pt[c, :] = quad_3pt
+
+    integrals_3pt = {
+        IntegralType.cell: [
+            (
+                0,
+                tabulate_with_per_cell_quadrature.address,
+                cells,
+                active_coeffs,
+                quad_data_uniform_3pt.ctypes.data,
+            )
+        ]
+    }
+    L_3pt = Form(formtype([V._cpp_object], integrals_3pt, [], [], False, [], mesh=mesh._cpp_object))
+
+    b_3pt = dolfinx.fem.assemble_vector(L_3pt)
+    b_3pt.scatter_reverse(la.InsertMode.add)
+    total_3pt = np.sum(b_3pt.array)
+
+    # Both should give exact integral = 1.0 for linear basis functions
+    assert np.isclose(total_1pt, 1.0, rtol=1e-10)
+    assert np.isclose(total_3pt, 1.0, rtol=1e-10)
+
+    # Test 3: Mixed quadrature - first half uses 1-point, second half uses 3-point
+    # This simulates adaptive quadrature where some cells need higher accuracy
+    quad_data_mixed = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
+    for c in range(num_cells):
+        if c < num_cells // 2:
+            quad_data_mixed[c, :] = quad_1pt  # Low-order cells
+        else:
+            quad_data_mixed[c, :] = quad_3pt  # High-order cells
+
+    integrals_mixed = {
+        IntegralType.cell: [
+            (
+                0,
+                tabulate_with_per_cell_quadrature.address,
+                cells,
+                active_coeffs,
+                quad_data_mixed.ctypes.data,
+            )
+        ]
+    }
+    L_mixed = Form(
+        formtype([V._cpp_object], integrals_mixed, [], [], False, [], mesh=mesh._cpp_object)
+    )
+
+    b_mixed = dolfinx.fem.assemble_vector(L_mixed)
+    b_mixed.scatter_reverse(la.InsertMode.add)
+    total_mixed = np.sum(b_mixed.array)
+
+    # Mixed quadrature should also give exact result for linear basis functions
+    assert np.isclose(total_mixed, 1.0, rtol=1e-10)
+
+    # Test 4: Verify per-cell access works by using scaled weights
+    # Cells in first half: weight scaled by 2.0, second half: normal weight
+    # This should NOT give exact integral but demonstrates per-cell differentiation
+    quad_data_scaled = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
+    for c in range(num_cells):
+        if c < num_cells // 2:
+            # Scaled 1-point rule (weight = 1.0 instead of 0.5)
+            quad_data_scaled[c, :] = [1.0, 1.0 / 3.0, 1.0 / 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        else:
+            # Normal 1-point rule (weight = 0.5)
+            quad_data_scaled[c, :] = quad_1pt
+
+    integrals_scaled = {
+        IntegralType.cell: [
+            (
+                0,
+                tabulate_with_per_cell_quadrature.address,
+                cells,
+                active_coeffs,
+                quad_data_scaled.ctypes.data,
+            )
+        ]
+    }
+    L_scaled = Form(
+        formtype([V._cpp_object], integrals_scaled, [], [], False, [], mesh=mesh._cpp_object)
+    )
+
+    b_scaled = dolfinx.fem.assemble_vector(L_scaled)
+    b_scaled.scatter_reverse(la.InsertMode.add)
+    total_scaled = np.sum(b_scaled.array)
+
+    # First half contributes 2x, second half contributes 1x
+    # Total = 0.5 * 2 + 0.5 * 1 = 1.5
+    assert np.isclose(total_scaled, 1.5, rtol=1e-10)

From f62e7de289f285bbda01ea40f96e88ad3ee05732 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 14:47:16 +0100
Subject: [PATCH 06/17] Make sure that just custom_data is passed. No other
 changes

---
 cpp/dolfinx/fem/assemble_matrix_impl.h   |   2 +-
 cpp/dolfinx/fem/assemble_scalar_impl.h   |   2 +-
 cpp/dolfinx/fem/assemble_vector_impl.h   |   2 +-
 python/test/unit/fem/test_custom_data.py | 362 +----------------------
 4 files changed, 8 insertions(+), 360 deletions(-)

diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index 0737a2463a..5b68e157a8 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -111,7 +111,7 @@ void assemble_cells_matrix(
 
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
-    kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), &cell,
+    kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), nullptr,
            nullptr, custom_data.value_or(nullptr));
 
     // Compute A = P_0 \tilde{A} P_1^T (dof transformation)
diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 4b305f8f4a..2e01619f83 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -50,7 +50,7 @@ T assemble_cells(mdspan2_t x_dofmap,
     for (std::size_t i = 0; i < x_dofs.size(); ++i)
       std::copy_n(&x(x_dofs[i], 0), 3, std::next(cdofs_b.begin(), 3 * i));
 
-    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), &c, nullptr,
+    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr, nullptr,
        custom_data.value_or(nullptr));
   }
 
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index 5e71b2b469..b85e17d89b 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -164,7 +164,7 @@ void _lift_bc_cells(
     auto dofs0 = md::submdspan(dmap0, c0, md::full_extent);
 
     std::ranges::fill(Ae, 0);
-    kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(), &c,
+    kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(), nullptr,
            nullptr, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, c0, num_cols);
     P1T(Ae, cell_info1, c1, num_rows);
diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index 75e8d57af5..3471589bf8 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -6,55 +6,22 @@
 #
 # SPDX-License-Identifier:    LGPL-3.0-or-later
 
+from ffcx.codegeneration.utils import (
+    numba_ufcx_kernel_signature,
+    voidptr_to_float64_ptr,
+)
 from mpi4py import MPI
 
 import numpy as np
 import pytest
 
 import dolfinx
-import ffcx.codegeneration.utils
 from dolfinx import la
 from dolfinx.fem import Form, IntegralType, form_cpp_class, functionspace
 from dolfinx.mesh import create_unit_square
 
 numba = pytest.importorskip("numba")
-ufcx_signature = ffcx.codegeneration.utils.numba_ufcx_kernel_signature
-
-
-# Helper intrinsic to cast void* to a typed pointer for custom_data
-@numba.extending.intrinsic
-def voidptr_to_float64_ptr(typingctx, src):
-    """Cast a void pointer (CPointer(void)) to a float64 pointer.
-
-    This function is used to access custom_data passed through the UFCx
-    tabulate_tensor interface. Since custom_data is passed as void*, this
-    intrinsic allows casting it to a typed float64 pointer for element access.
-
-    Args:
-        typingctx: The typing context.
-        src: A void pointer (CPointer(void)) to cast.
-
-    Returns:
-        sig: A Numba signature returning CPointer(float64).
-        codegen: A code generation function that performs the bitcast.
-
-    Example:
-        Inside a Numba cfunc kernel::
-
-            typed_ptr = voidptr_to_float64_ptr(custom_data)
-            scale = typed_ptr[0]  # Access first float64 value
-    """
-    # Accept CPointer(void) which shows as 'none*' in numba type system
-    if isinstance(src, numba.types.CPointer) and src.dtype == numba.types.void:
-        sig = numba.types.CPointer(numba.types.float64)(src)
-
-        def codegen(context, builder, signature, args):
-            [src] = args
-            # Cast void* to float64*
-            dst_type = context.get_value_type(numba.types.CPointer(numba.types.float64))
-            return builder.bitcast(src, dst_type)
-
-        return sig, codegen
+ufcx_signature = numba_ufcx_kernel_signature
 
 
 def tabulate_rank1_with_custom_data(dtype, xdtype):
@@ -441,322 +408,3 @@ def tabulate_diffusion(b_, w_, c_, coords_, local_index, orientation, custom_dat
         expected_ratio = k / 1.0
         actual_ratio = results[i] / base_norm
         assert np.isclose(actual_ratio, expected_ratio, rtol=1e-10)
-
-
-@pytest.mark.parametrize("dtype", [np.float64])
-def test_custom_data_per_cell_material(dtype):
-    """Test custom_data with per-cell material properties using cell index.
-
-    This test demonstrates the use case where a kernel needs to access
-    cell-specific data. The cell index is now passed through entity_local_index
-    for cell integrals, allowing the kernel to look up per-cell values.
-
-    The custom_data pointer points to an array of values indexed by cell number.
-    The kernel uses entity_local_index[0] (which contains the cell index for
-    cell integrals) to look up the appropriate value.
-    """
-    xdtype = np.real(dtype(0)).dtype
-
-    # Kernel that reads per-cell material property from custom_data
-    # custom_data points to array: material_values[cell_index]
-    # entity_local_index[0] contains the cell index for cell integrals
-    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
-    def tabulate_per_cell_material(
-        b_, w_, c_, coords_, entity_local_index, cell_orientation, custom_data
-    ):
-        b = numba.carray(b_, (3), dtype=dtype)
-        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
-
-        # Cast void* to float64* - this points to per-cell material array
-        material_array = voidptr_to_float64_ptr(custom_data)
-
-        # entity_local_index[0] contains the cell index for cell integrals
-        cell_idx_ptr = numba.carray(entity_local_index, (1,), dtype=np.int32)
-        cell_idx = cell_idx_ptr[0]
-
-        # Look up the material property for this specific cell
-        material_value = material_array[cell_idx]
-
-        x0, y0 = coordinate_dofs[0, :2]
-        x1, y1 = coordinate_dofs[1, :2]
-        x2, y2 = coordinate_dofs[2, :2]
-
-        # 2x Element area Ae
-        Ae = abs((x0 - x1) * (y2 - y1) - (y0 - y1) * (x2 - x1))
-        b[:] = material_value * Ae / 6.0
-
-    mesh = create_unit_square(MPI.COMM_WORLD, 4, 4, dtype=xdtype)
-    V = functionspace(mesh, ("Lagrange", 1))
-
-    tdim = mesh.topology.dim
-    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
-    cells = np.arange(num_cells, dtype=np.int32)
-    active_coeffs = np.array([], dtype=np.int8)
-
-    # Create per-cell material array - all cells have material=1.0
-    material_values = np.ones(num_cells, dtype=dtype)
-
-    # Pass pointer to material array as custom_data
-    integrals = {
-        IntegralType.cell: [
-            (
-                0,
-                tabulate_per_cell_material.address,
-                cells,
-                active_coeffs,
-                material_values.ctypes.data,
-            )
-        ]
-    }
-    formtype = form_cpp_class(dtype)
-    L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
-
-    # Assemble with uniform material=1.0
-    b_uniform = dolfinx.fem.assemble_vector(L)
-    b_uniform.scatter_reverse(la.InsertMode.add)
-    norm_uniform = la.norm(b_uniform)
-
-    # Now set material=2.0 for all cells - should double the result
-    material_values[:] = 2.0
-    b_doubled = dolfinx.fem.assemble_vector(L)
-    b_doubled.scatter_reverse(la.InsertMode.add)
-    norm_doubled = la.norm(b_doubled)
-
-    assert np.isclose(norm_doubled, 2.0 * norm_uniform)
-
-    # Test heterogeneous material: first half of cells have material=1.0,
-    # second half have material=3.0
-    material_values[: num_cells // 2] = 1.0
-    material_values[num_cells // 2 :] = 3.0
-    b_hetero = dolfinx.fem.assemble_vector(L)
-    b_hetero.scatter_reverse(la.InsertMode.add)
-
-    # The result should be between uniform material=1.0 and material=3.0
-    norm_hetero = la.norm(b_hetero)
-    assert norm_uniform < norm_hetero < 3.0 * norm_uniform
-
-    # Verify the total contribution matches expected:
-    # The kernel computes Ae = 2*area (determinant formula), so:
-    # Each cell contributes material[i] * Ae / 6 = material[i] * area / 3 to each of 3 DOFs
-    # Total per cell = 3 * material[i] * area / 3 = material[i] * area
-    # Total = sum_i (material[i] * area_i)
-    # For uniform mesh on unit square with 4x4 grid: 32 triangles, each area 1/32
-    total_contribution = np.sum(b_hetero.array)
-    expected_sum = np.sum(material_values) * (1.0 / num_cells)
-    assert np.isclose(total_contribution, expected_sum)
-
-
-@pytest.mark.parametrize("dtype", [np.float64])
-def test_custom_data_runtime_quadrature(dtype):
-    """Test custom_data with per-cell runtime quadrature rules.
-
-    This test demonstrates the key use case of passing runtime-computed
-    quadrature points and weights to a kernel via custom_data. This enables:
-    - Adaptive quadrature based on solution features
-    - Different quadrature rules per cell (hp-adaptivity)
-    - Quadrature rules computed from external sources
-
-    The custom_data points to an array of quadrature rule data for each cell.
-    The kernel uses entity_local_index[0] (cell index) to look up the
-    quadrature rule for that specific cell.
-
-    Layout per cell: [num_points, xi_0, eta_0, w_0, xi_1, eta_1, w_1, ...]
-    We use fixed-size slots (max 3 points = 10 doubles per cell) for simplicity.
-    """
-    xdtype = np.real(dtype(0)).dtype
-
-    # Fixed slot size: 1 (num_points) + 3*3 (max 3 points with xi, eta, weight) = 10
-    SLOT_SIZE = 10
-
-    # Kernel that integrates using per-cell quadrature from custom_data
-    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
-    def tabulate_with_per_cell_quadrature(
-        b_, w_, c_, coords_, entity_local_index, orientation, custom_data
-    ):
-        b = numba.carray(b_, (3), dtype=dtype)
-        coordinate_dofs = numba.carray(coords_, (3, 3), dtype=xdtype)
-
-        # Get cell index from entity_local_index
-        cell_idx_ptr = numba.carray(entity_local_index, (1,), dtype=np.int32)
-        cell_idx = cell_idx_ptr[0]
-
-        # Read quadrature data for this cell from custom_data
-        # Layout: quad_data[cell_idx * SLOT_SIZE : (cell_idx + 1) * SLOT_SIZE]
-        quad_data = voidptr_to_float64_ptr(custom_data)
-        slot_start = cell_idx * 10  # SLOT_SIZE = 10
-        num_points = int(quad_data[slot_start])
-
-        # Get physical coordinates of triangle vertices
-        x0, y0 = coordinate_dofs[0, :2]
-        x1, y1 = coordinate_dofs[1, :2]
-        x2, y2 = coordinate_dofs[2, :2]
-
-        # Jacobian determinant (2 * area)
-        detJ = abs((x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0))
-
-        # Integrate: sum over quadrature points
-        b0 = dtype(0.0)
-        b1 = dtype(0.0)
-        b2 = dtype(0.0)
-
-        for q in range(num_points):
-            # Each quad point has: xi, eta, weight
-            xi = quad_data[slot_start + 1 + q * 3]
-            eta = quad_data[slot_start + 1 + q * 3 + 1]
-            w = quad_data[slot_start + 1 + q * 3 + 2]
-
-            # Basis function values at (xi, eta)
-            # N0 = 1 - xi - eta, N1 = xi, N2 = eta
-            N0 = 1.0 - xi - eta
-            N1 = xi
-            N2 = eta
-
-            # Accumulate: integral of N_i over element
-            b0 += N0 * w * detJ
-            b1 += N1 * w * detJ
-            b2 += N2 * w * detJ
-
-        b[0] = b0
-        b[1] = b1
-        b[2] = b2
-
-    mesh = create_unit_square(MPI.COMM_WORLD, 4, 4, dtype=xdtype)
-    V = functionspace(mesh, ("Lagrange", 1))
-
-    tdim = mesh.topology.dim
-    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
-    cells = np.arange(num_cells, dtype=np.int32)
-    active_coeffs = np.array([], dtype=np.int8)
-    formtype = form_cpp_class(dtype)
-
-    # Define quadrature rules
-    # 1-point centroid rule (exact for degree 1)
-    quad_1pt = [1.0, 1.0 / 3.0, 1.0 / 3.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-
-    # 3-point Gauss rule (exact for degree 2)
-    quad_3pt = [
-        3.0,
-        1.0 / 6.0,
-        1.0 / 6.0,
-        1.0 / 6.0,
-        2.0 / 3.0,
-        1.0 / 6.0,
-        1.0 / 6.0,
-        1.0 / 6.0,
-        2.0 / 3.0,
-        1.0 / 6.0,
-    ]
-
-    # Test 1: All cells use 1-point rule
-    quad_data_uniform_1pt = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
-    for c in range(num_cells):
-        quad_data_uniform_1pt[c, :] = quad_1pt
-
-    integrals_1pt = {
-        IntegralType.cell: [
-            (
-                0,
-                tabulate_with_per_cell_quadrature.address,
-                cells,
-                active_coeffs,
-                quad_data_uniform_1pt.ctypes.data,
-            )
-        ]
-    }
-    L_1pt = Form(formtype([V._cpp_object], integrals_1pt, [], [], False, [], mesh=mesh._cpp_object))
-
-    b_1pt = dolfinx.fem.assemble_vector(L_1pt)
-    b_1pt.scatter_reverse(la.InsertMode.add)
-    total_1pt = np.sum(b_1pt.array)
-
-    # Test 2: All cells use 3-point rule
-    quad_data_uniform_3pt = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
-    for c in range(num_cells):
-        quad_data_uniform_3pt[c, :] = quad_3pt
-
-    integrals_3pt = {
-        IntegralType.cell: [
-            (
-                0,
-                tabulate_with_per_cell_quadrature.address,
-                cells,
-                active_coeffs,
-                quad_data_uniform_3pt.ctypes.data,
-            )
-        ]
-    }
-    L_3pt = Form(formtype([V._cpp_object], integrals_3pt, [], [], False, [], mesh=mesh._cpp_object))
-
-    b_3pt = dolfinx.fem.assemble_vector(L_3pt)
-    b_3pt.scatter_reverse(la.InsertMode.add)
-    total_3pt = np.sum(b_3pt.array)
-
-    # Both should give exact integral = 1.0 for linear basis functions
-    assert np.isclose(total_1pt, 1.0, rtol=1e-10)
-    assert np.isclose(total_3pt, 1.0, rtol=1e-10)
-
-    # Test 3: Mixed quadrature - first half uses 1-point, second half uses 3-point
-    # This simulates adaptive quadrature where some cells need higher accuracy
-    quad_data_mixed = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
-    for c in range(num_cells):
-        if c < num_cells // 2:
-            quad_data_mixed[c, :] = quad_1pt  # Low-order cells
-        else:
-            quad_data_mixed[c, :] = quad_3pt  # High-order cells
-
-    integrals_mixed = {
-        IntegralType.cell: [
-            (
-                0,
-                tabulate_with_per_cell_quadrature.address,
-                cells,
-                active_coeffs,
-                quad_data_mixed.ctypes.data,
-            )
-        ]
-    }
-    L_mixed = Form(
-        formtype([V._cpp_object], integrals_mixed, [], [], False, [], mesh=mesh._cpp_object)
-    )
-
-    b_mixed = dolfinx.fem.assemble_vector(L_mixed)
-    b_mixed.scatter_reverse(la.InsertMode.add)
-    total_mixed = np.sum(b_mixed.array)
-
-    # Mixed quadrature should also give exact result for linear basis functions
-    assert np.isclose(total_mixed, 1.0, rtol=1e-10)
-
-    # Test 4: Verify per-cell access works by using scaled weights
-    # Cells in first half: weight scaled by 2.0, second half: normal weight
-    # This should NOT give exact integral but demonstrates per-cell differentiation
-    quad_data_scaled = np.zeros((num_cells, SLOT_SIZE), dtype=dtype)
-    for c in range(num_cells):
-        if c < num_cells // 2:
-            # Scaled 1-point rule (weight = 1.0 instead of 0.5)
-            quad_data_scaled[c, :] = [1.0, 1.0 / 3.0, 1.0 / 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-        else:
-            # Normal 1-point rule (weight = 0.5)
-            quad_data_scaled[c, :] = quad_1pt
-
-    integrals_scaled = {
-        IntegralType.cell: [
-            (
-                0,
-                tabulate_with_per_cell_quadrature.address,
-                cells,
-                active_coeffs,
-                quad_data_scaled.ctypes.data,
-            )
-        ]
-    }
-    L_scaled = Form(
-        formtype([V._cpp_object], integrals_scaled, [], [], False, [], mesh=mesh._cpp_object)
-    )
-
-    b_scaled = dolfinx.fem.assemble_vector(L_scaled)
-    b_scaled.scatter_reverse(la.InsertMode.add)
-    total_scaled = np.sum(b_scaled.array)
-
-    # First half contributes 2x, second half contributes 1x
-    # Total = 0.5 * 2 + 0.5 * 1 = 1.5
-    assert np.isclose(total_scaled, 1.5, rtol=1e-10)

From 310f93ec886f13686c7f74548db517023368afb6 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 15:11:16 +0100
Subject: [PATCH 07/17] fix ruff and add ffcx branch for custom_data ptr
 casting

---
 .github/workflows/fenicsx-refs.env       | 2 +-
 python/test/unit/fem/test_custom_data.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/fenicsx-refs.env b/.github/workflows/fenicsx-refs.env
index 22c7e44a7e..693a736666 100644
--- a/.github/workflows/fenicsx-refs.env
+++ b/.github/workflows/fenicsx-refs.env
@@ -3,4 +3,4 @@ basix_ref=main
 ufl_repository=FEniCS/ufl
 ufl_ref=main
 ffcx_repository=FEniCS/ffcx
-ffcx_ref=main
+ffcx_ref=sclaus2/add_voidptr_cast_intrinsic
diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index 3471589bf8..1cd3999cdc 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -6,10 +6,6 @@
 #
 # SPDX-License-Identifier:    LGPL-3.0-or-later
 
-from ffcx.codegeneration.utils import (
-    numba_ufcx_kernel_signature,
-    voidptr_to_float64_ptr,
-)
 from mpi4py import MPI
 
 import numpy as np
@@ -19,6 +15,10 @@
 from dolfinx import la
 from dolfinx.fem import Form, IntegralType, form_cpp_class, functionspace
 from dolfinx.mesh import create_unit_square
+from ffcx.codegeneration.utils import (
+    numba_ufcx_kernel_signature,
+    voidptr_to_float64_ptr,
+)
 
 numba = pytest.importorskip("numba")
 ufcx_signature = numba_ufcx_kernel_signature

From 2bfcf1a8405e1c7b15ab258f2d5b24afe956c122 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 15:37:41 +0100
Subject: [PATCH 08/17] fix clang-format

---
 cpp/dolfinx/fem/assemble_scalar_impl.h | 4 ++--
 cpp/dolfinx/fem/assemble_vector_impl.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 2e01619f83..016c3a9bd4 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -50,8 +50,8 @@ T assemble_cells(mdspan2_t x_dofmap,
     for (std::size_t i = 0; i < x_dofs.size(); ++i)
       std::copy_n(&x(x_dofs[i], 0), 3, std::next(cdofs_b.begin(), 3 * i));
 
-    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr, nullptr,
-       custom_data.value_or(nullptr));
+    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr,
+       nullptr, custom_data.value_or(nullptr));
   }
 
   return value;
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index b85e17d89b..f8dcb7faab 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -164,8 +164,8 @@ void _lift_bc_cells(
     auto dofs0 = md::submdspan(dmap0, c0, md::full_extent);
 
     std::ranges::fill(Ae, 0);
-    kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(), nullptr,
-           nullptr, custom_data.value_or(nullptr));
+    kernel(Ae.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
+           nullptr, nullptr, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, c0, num_cols);
     P1T(Ae, cell_info1, c1, num_rows);
 

From 086469928465015719301f0ec0b2d6ea4460b334 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 16:01:54 +0100
Subject: [PATCH 09/17] fix docs

---
 cpp/dolfinx/fem/assemble_scalar_impl.h | 2 ++
 cpp/dolfinx/fem/assemble_vector_impl.h | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 016c3a9bd4..75d9b7cf1d 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -67,6 +67,7 @@ T assemble_cells(mdspan2_t x_dofmap,
 /// However, entities may be attached to more than one cell. This function
 /// therefore computes 'one-sided' integrals, i.e. evaluates integrals as seen
 /// from cell used to define the entity.
+/// @param custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
 template <dolfinx::scalar T>
 T assemble_entities(
     mdspan2_t x_dofmap,
@@ -109,6 +110,7 @@ T assemble_entities(
 }
 
 /// Assemble functional over interior facets
+/// @param custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
 template <dolfinx::scalar T>
 T assemble_interior_facets(
     mdspan2_t x_dofmap,
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index f8dcb7faab..ee42866781 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -76,6 +76,7 @@ using mdspan2_t = md::mdspan<const std::int32_t, md::dextents<std::size_t, 2>>;
 /// conditions applied.
 /// @param[in] x0 Vector used in the lifting.
 /// @param[in] alpha Scaling to apply.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
 template <int _bs0 = -1, int _bs1 = -1, typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>
@@ -261,6 +262,7 @@ void _lift_bc_cells(
 /// local_facet_idx)` is the permutation value for the facet attached to
 /// the cell `cell_idx` with local index `local_facet_idx` relative to
 /// the cell. Empty if facet permutations are not required.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
 template <typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>
@@ -417,6 +419,7 @@ void _lift_bc_entities(
 /// local_facet_idx)` is the permutation value for the facet attached to
 /// the cell `cell_idx` with local index `local_facet_idx` relative to
 /// the cell. Empty if facet permutations are not required.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
 template <typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>

From 709b0f42e531ca58590a48162b8d5b9371de6d01 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 16:05:32 +0100
Subject: [PATCH 10/17] fix clang

---
 cpp/dolfinx/fem/assemble_scalar_impl.h | 6 ++++--
 cpp/dolfinx/fem/assemble_vector_impl.h | 9 ++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 75d9b7cf1d..693baad770 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -67,7 +67,8 @@ T assemble_cells(mdspan2_t x_dofmap,
 /// However, entities may be attached to more than one cell. This function
 /// therefore computes 'one-sided' integrals, i.e. evaluates integrals as seen
 /// from cell used to define the entity.
-/// @param custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
+/// @param custom_data Optional pointer to user-supplied data passed to the
+/// kernel at runtime.
 template <dolfinx::scalar T>
 T assemble_entities(
     mdspan2_t x_dofmap,
@@ -110,7 +111,8 @@ T assemble_entities(
 }
 
 /// Assemble functional over interior facets
-/// @param custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
+/// @param custom_data Optional pointer to user-supplied data passed to the
+/// kernel at runtime.
 template <dolfinx::scalar T>
 T assemble_interior_facets(
     mdspan2_t x_dofmap,
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index ee42866781..162f9f74eb 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -76,7 +76,8 @@ using mdspan2_t = md::mdspan<const std::int32_t, md::dextents<std::size_t, 2>>;
 /// conditions applied.
 /// @param[in] x0 Vector used in the lifting.
 /// @param[in] alpha Scaling to apply.
-/// @param[in] custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to the
+/// kernel at runtime.
 template <int _bs0 = -1, int _bs1 = -1, typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>
@@ -262,7 +263,8 @@ void _lift_bc_cells(
 /// local_facet_idx)` is the permutation value for the facet attached to
 /// the cell `cell_idx` with local index `local_facet_idx` relative to
 /// the cell. Empty if facet permutations are not required.
-/// @param[in] custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to the
+/// kernel at runtime.
 template <typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>
@@ -419,7 +421,8 @@ void _lift_bc_entities(
 /// local_facet_idx)` is the permutation value for the facet attached to
 /// the cell `cell_idx` with local index `local_facet_idx` relative to
 /// the cell. Empty if facet permutations are not required.
-/// @param[in] custom_data Optional pointer to user-supplied data passed to the kernel at runtime.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to the
+/// kernel at runtime.
 template <typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>

From dd72a4960879d78c8afaa6c9a8dce25158a4ba36 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 16:27:43 +0100
Subject: [PATCH 11/17] more doc fixes

---
 cpp/dolfinx/fem/assemble_matrix_impl.h | 2 ++
 cpp/dolfinx/fem/assemble_scalar_impl.h | 4 ----
 cpp/dolfinx/fem/assemble_vector_impl.h | 6 ++++++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index 5b68e157a8..ceb505ddb2 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -343,6 +343,8 @@ void assemble_entities(
 /// function mesh.
 /// @param[in] perms Facet permutation integer. Empty if facet
 /// permutations are not required.
+/// @param[in] custom_data Optional pointer to user-supplied data passed
+/// to the kernel at runtime.
 template <dolfinx::scalar T>
 void assemble_interior_facets(
     la::MatSet<T> auto mat_set, mdspan2_t x_dofmap,
diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 693baad770..016c3a9bd4 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -67,8 +67,6 @@ T assemble_cells(mdspan2_t x_dofmap,
 /// However, entities may be attached to more than one cell. This function
 /// therefore computes 'one-sided' integrals, i.e. evaluates integrals as seen
 /// from cell used to define the entity.
-/// @param custom_data Optional pointer to user-supplied data passed to the
-/// kernel at runtime.
 template <dolfinx::scalar T>
 T assemble_entities(
     mdspan2_t x_dofmap,
@@ -111,8 +109,6 @@ T assemble_entities(
 }
 
 /// Assemble functional over interior facets
-/// @param custom_data Optional pointer to user-supplied data passed to the
-/// kernel at runtime.
 template <dolfinx::scalar T>
 T assemble_interior_facets(
     mdspan2_t x_dofmap,
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index 162f9f74eb..39ab35adcf 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -667,6 +667,8 @@ void _lift_bc_interior_facets(
 /// coefficient for cell `i`.
 /// @param[in] cell_info0 Cell permutation information for the test
 /// function mesh.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to
+/// the kernel at runtime.
 template <int _bs = -1, typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>
@@ -761,6 +763,8 @@ void assemble_cells(
 /// function mesh.
 /// @param[in] perms Entity permutation integer. Empty if entity
 /// permutations are not required.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to
+/// the kernel at runtime.
 template <int _bs = -1, typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>
@@ -857,6 +861,8 @@ void assemble_entities(
 /// function mesh.
 /// @param[in] perms Facet permutation integer. Empty if facet
 /// permutations are not required.
+/// @param[in] custom_data Optional pointer to user-supplied data passed to
+/// the kernel at runtime.
 template <int _bs = -1, typename V,
           dolfinx::scalar T = typename std::remove_cvref_t<V>::value_type>
   requires std::is_same_v<typename std::remove_cvref_t<V>::value_type, T>

From 487fb2e508705768bc815811dd41b59747b20519 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Fri, 5 Dec 2025 17:46:28 +0100
Subject: [PATCH 12/17] Fix tests in parallel

---
 python/test/unit/fem/test_custom_data.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index 1cd3999cdc..4df6675592 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -237,8 +237,10 @@ def tabulate_with_struct(b_, w_, c_, coords_, local_index, orientation, custom_d
     V = functionspace(mesh, ("Lagrange", 1))
 
     tdim = mesh.topology.dim
-    num_cells = mesh.topology.index_map(tdim).size_local + mesh.topology.index_map(tdim).num_ghosts
-    cells = np.arange(num_cells, dtype=np.int32)
+    # Only iterate over local cells (not ghosts) to avoid double-counting
+    # contributions after scatter_reverse
+    num_local_cells = mesh.topology.index_map(tdim).size_local
+    cells = np.arange(num_local_cells, dtype=np.int32)
     active_coeffs = np.array([], dtype=np.int8)
 
     # Test 1: scale=1.0, offset=0.0 (baseline)
@@ -272,9 +274,12 @@ def tabulate_with_struct(b_, w_, c_, coords_, local_index, orientation, custom_d
     b_offset.scatter_reverse(la.InsertMode.add)
     # With offset=1.0, each DOF gets contribution from each cell it touches
     # Interior nodes touch 6 cells, edge nodes touch 3-4, corner nodes touch 1-2
-    # The sum of all contributions equals 3 * num_cells (3 DOFs per cell, offset=1.0 each)
-    total_contribution = np.sum(b_offset.array)
-    assert np.isclose(total_contribution, 3.0 * num_cells)
+    # The sum of all contributions equals 3 * num_local_cells (3 DOFs per cell, offset=1.0 each)
+    # Sum only local DOFs and gather across processes
+    local_sum = np.sum(b_offset.array[: V.dofmap.index_map.size_local * V.dofmap.index_map_bs])
+    total_contribution = mesh.comm.allreduce(local_sum, op=MPI.SUM)
+    total_cells = mesh.comm.allreduce(num_local_cells, op=MPI.SUM)
+    assert np.isclose(total_contribution, 3.0 * total_cells)
 
 
 @pytest.mark.parametrize("dtype", [np.float64])

From 084a5da69383fa6f07551a621809158e83d2e1ec Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Tue, 12 May 2026 00:23:16 +0200
Subject: [PATCH 13/17] fix ruff check

---
 python/test/unit/fem/test_custom_data.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index dd72e87917..509c666813 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -37,7 +37,6 @@ def tabulate_rank1_with_custom_data(dtype, xdtype):
 
     Note: custom_data must be set to a valid pointer before assembly.
     """
-
     voidptr_to_scalar_ptr = voidptr_to_dtype_ptr(dtype)
 
     @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
@@ -65,7 +64,6 @@ def tabulate_rank2_with_custom_data(dtype, xdtype):
 
     Note: custom_data must be set to a valid pointer before assembly.
     """
-
     voidptr_to_scalar_ptr = voidptr_to_dtype_ptr(dtype)
 
     @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)

From c5df0c3ce6ab388a428aa47ca2a86abb1f8c265d Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Tue, 12 May 2026 00:26:38 +0200
Subject: [PATCH 14/17] Format custom data expression helper

---
 cpp/dolfinx/fem/utils.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/dolfinx/fem/utils.h b/cpp/dolfinx/fem/utils.h
index ddc62c2b68..7d15226319 100644
--- a/cpp/dolfinx/fem/utils.h
+++ b/cpp/dolfinx/fem/utils.h
@@ -1059,7 +1059,8 @@ Expression<T, U> create_expression(
     }
   }
 
-  return create_expression(e, coeff_map, const_map, argument_space, custom_data);
+  return create_expression(e, coeff_map, const_map, argument_space,
+                           custom_data);
 }
 
 } // namespace dolfinx::fem

From bf90fc49dcc76ef03654a8e702111023f5fb7996 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Tue, 12 May 2026 00:31:06 +0200
Subject: [PATCH 15/17] clang formatting

---
 .../dolfinx/wrappers/dolfinx_wrappers/fem.h   | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/dolfinx/wrappers/dolfinx_wrappers/fem.h b/python/dolfinx/wrappers/dolfinx_wrappers/fem.h
index 6a2f288e3b..aeb36ec91f 100644
--- a/python/dolfinx/wrappers/dolfinx_wrappers/fem.h
+++ b/python/dolfinx/wrappers/dolfinx_wrappers/fem.h
@@ -681,7 +681,8 @@ void declare_objects(nb::module_& m, std::string type)
              const std::vector<
                  std::shared_ptr<const dolfinx::fem::Constant<T>>>& constants,
              nb::ndarray<const U, nb::ndim<2>, nb::c_contig> X,
-             std::uintptr_t fn_addr, const std::vector<std::size_t>& value_shape,
+             std::uintptr_t fn_addr,
+             const std::vector<std::size_t>& value_shape,
              std::shared_ptr<const dolfinx::fem::FunctionSpace<U>>
                  argument_space,
              std::optional<std::uintptr_t> custom_data)
@@ -701,16 +702,15 @@ void declare_objects(nb::module_& m, std::string type)
           nb::arg("coefficients"), nb::arg("constants"), nb::arg("X"),
           nb::arg("fn"), nb::arg("value_shape"), nb::arg("argument_space"),
           nb::arg("custom_data").none() = nb::none())
-      .def(
-          "custom_data",
-          [](const dolfinx::fem::Expression<T, U>& self)
-              -> std::optional<std::uintptr_t>
-          {
-            std::optional<void*> data = self.custom_data();
-            if (!data)
-              return std::nullopt;
-            return reinterpret_cast<std::uintptr_t>(*data);
-          })
+      .def("custom_data",
+           [](const dolfinx::fem::Expression<T, U>& self)
+               -> std::optional<std::uintptr_t>
+           {
+             std::optional<void*> data = self.custom_data();
+             if (!data)
+               return std::nullopt;
+             return reinterpret_cast<std::uintptr_t>(*data);
+           })
       .def("X",
            [](const dolfinx::fem::Expression<T, U>& self)
            {
@@ -872,8 +872,8 @@ void declare_form(nb::module_& m, std::string type)
       .def(
           "custom_data",
           [](const dolfinx::fem::Form<T, U>& self,
-             dolfinx::fem::IntegralType type, int idx, int kernel_idx)
-              -> std::optional<std::uintptr_t>
+             dolfinx::fem::IntegralType type, int idx,
+             int kernel_idx) -> std::optional<std::uintptr_t>
           {
             std::optional<void*> data = self.custom_data(type, idx, kernel_idx);
             if (!data)

From c2bd5c47868c99f9f5755f1f1c66374ecae01bdf Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Tue, 12 May 2026 00:40:36 +0200
Subject: [PATCH 16/17] fix import dolfinx in test_custom_data

---
 python/test/unit/fem/test_custom_data.py | 32 ++++++++++++++----------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index 509c666813..3d77147cc1 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -11,10 +11,16 @@
 import numpy as np
 import pytest
 
-import dolfinx
 import ffcx.codegeneration.utils as codegen_utils
 from dolfinx import la
-from dolfinx.fem import Form, IntegralType, form_cpp_class, functionspace
+from dolfinx.fem import (
+    Form,
+    IntegralType,
+    assemble_matrix,
+    assemble_vector,
+    form_cpp_class,
+    functionspace,
+)
 from dolfinx.mesh import create_unit_square
 
 numba = pytest.importorskip("numba")
@@ -113,7 +119,7 @@ def test_custom_data_vector_assembly(dtype):
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
     # Assemble with scale=1.0
-    b1 = dolfinx.fem.assemble_vector(L)
+    b1 = assemble_vector(L)
     b1.scatter_reverse(la.InsertMode.add)
     norm1 = la.norm(b1)
 
@@ -122,7 +128,7 @@ def test_custom_data_vector_assembly(dtype):
 
     # Update custom_data to scale=2.0 (by modifying the underlying array)
     scale_value[0] = 2.0
-    b2 = dolfinx.fem.assemble_vector(L)
+    b2 = assemble_vector(L)
     b2.scatter_reverse(la.InsertMode.add)
     norm2 = la.norm(b2)
 
@@ -131,7 +137,7 @@ def test_custom_data_vector_assembly(dtype):
 
     # Test with scale=3.0
     scale_value[0] = 3.0
-    b3 = dolfinx.fem.assemble_vector(L)
+    b3 = assemble_vector(L)
     b3.scatter_reverse(la.InsertMode.add)
     norm3 = la.norm(b3)
 
@@ -171,13 +177,13 @@ def test_custom_data_matrix_assembly(dtype):
     )
 
     # Assemble with scale=1.0
-    A1 = dolfinx.fem.assemble_matrix(a)
+    A1 = assemble_matrix(a)
     A1.scatter_reverse()
     norm1 = np.sqrt(A1.squared_norm())
 
     # Update custom_data to scale=2.0 (by modifying the underlying array)
     scale_value[0] = 2.0
-    A2 = dolfinx.fem.assemble_matrix(a)
+    A2 = assemble_matrix(a)
     A2.scatter_reverse()
     norm2 = np.sqrt(A2.squared_norm())
 
@@ -267,14 +273,14 @@ def tabulate_with_struct(b_, w_, c_, coords_, local_index, orientation, custom_d
     formtype = form_cpp_class(dtype)
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
-    b_baseline = dolfinx.fem.assemble_vector(L)
+    b_baseline = assemble_vector(L)
     b_baseline.scatter_reverse(la.InsertMode.add)
     norm_baseline = la.norm(b_baseline)
 
     # Test 2: scale=2.0, offset=0.0 - should double the norm
     struct_data[0] = 2.0
     struct_data[1] = 0.0
-    b_scaled = dolfinx.fem.assemble_vector(L)
+    b_scaled = assemble_vector(L)
     b_scaled.scatter_reverse(la.InsertMode.add)
     norm_scaled = la.norm(b_scaled)
     assert np.isclose(norm_scaled, 2.0 * norm_baseline)
@@ -282,7 +288,7 @@ def tabulate_with_struct(b_, w_, c_, coords_, local_index, orientation, custom_d
     # Test 3: scale=0.0, offset=1.0 - pure offset contribution
     struct_data[0] = 0.0
     struct_data[1] = 1.0
-    b_offset = dolfinx.fem.assemble_vector(L)
+    b_offset = assemble_vector(L)
     b_offset.scatter_reverse(la.InsertMode.add)
     # With offset=1.0, each DOF gets contribution from each cell it touches
     # Interior nodes touch 6 cells, edge nodes touch 3-4, corner nodes touch 1-2
@@ -348,14 +354,14 @@ def tabulate_with_params(b_, w_, c_, coords_, local_index, orientation, custom_d
     formtype = form_cpp_class(dtype)
     L = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
 
-    b1 = dolfinx.fem.assemble_vector(L)
+    b1 = assemble_vector(L)
     b1.scatter_reverse(la.InsertMode.add)
     norm1 = la.norm(b1)
 
     # Change parameters: coeff=1, power=6, additive=0 (should give same result: 1*6 = 2*3)
     params[0] = 1.0
     params[1] = 6.0
-    b2 = dolfinx.fem.assemble_vector(L)
+    b2 = assemble_vector(L)
     b2.scatter_reverse(la.InsertMode.add)
     norm2 = la.norm(b2)
 
@@ -418,7 +424,7 @@ def tabulate_diffusion(b_, w_, c_, coords_, local_index, orientation, custom_dat
 
     for k in kappa_values:
         kappa[0] = k
-        b = dolfinx.fem.assemble_vector(L)
+        b = assemble_vector(L)
         b.scatter_reverse(la.InsertMode.add)
         results.append(la.norm(b))
 

From ab1ff71a2597791c30c6907b63c24d32f1311be0 Mon Sep 17 00:00:00 2001
From: Susanne Claus <susanne.claus@onera.fr>
Date: Tue, 12 May 2026 14:06:33 +0200
Subject: [PATCH 17/17] pass loop index to integration kernel for runtime
 integration

---
 cpp/dolfinx/fem/assemble_expression_impl.h |   9 +-
 cpp/dolfinx/fem/assemble_matrix_impl.h     |  14 +-
 cpp/dolfinx/fem/assemble_scalar_impl.h     |  16 +-
 cpp/dolfinx/fem/assemble_vector_impl.h     |  14 +-
 python/test/unit/fem/test_custom_data.py   | 177 ++++++++++++++++++++-
 5 files changed, 214 insertions(+), 16 deletions(-)

diff --git a/cpp/dolfinx/fem/assemble_expression_impl.h b/cpp/dolfinx/fem/assemble_expression_impl.h
index 885dc4504b..da4192a847 100644
--- a/cpp/dolfinx/fem/assemble_expression_impl.h
+++ b/cpp/dolfinx/fem/assemble_expression_impl.h
@@ -11,6 +11,7 @@
 #include "traits.h"
 #include "utils.h"
 #include <algorithm>
+#include <array>
 #include <basix/mdspan.hpp>
 #include <dolfinx/common/IndexMap.h>
 #include <dolfinx/mesh/Geometry.h>
@@ -95,8 +96,10 @@ void tabulate_expression(
         std::copy_n(std::next(x.begin(), 3 * x_dofs[i]), 3,
                     std::next(coord_dofs.begin(), 3 * i));
       }
+      std::int32_t entity_local_index = static_cast<std::int32_t>(e);
       fn(values_local.data(), &coeffs(e, 0), constants.data(),
-         coord_dofs.data(), nullptr, nullptr, custom_data.value_or(nullptr));
+         coord_dofs.data(), &entity_local_index, nullptr,
+         custom_data.value_or(nullptr));
 
       P0(values_local, cell_info, entity, size0);
     }
@@ -109,8 +112,10 @@ void tabulate_expression(
         std::copy_n(std::next(x.begin(), 3 * x_dofs[i]), 3,
                     std::next(coord_dofs.begin(), 3 * i));
       }
+      std::array<std::int32_t, 2> entity_local_index{
+          entities(e, 1), static_cast<std::int32_t>(e)};
       fn(values_local.data(), &coeffs(e, 0), constants.data(),
-         coord_dofs.data(), &entities(e, 1), nullptr,
+         coord_dofs.data(), entity_local_index.data(), nullptr,
          custom_data.value_or(nullptr));
 
       P0(values_local, cell_info, entity, size0);
diff --git a/cpp/dolfinx/fem/assemble_matrix_impl.h b/cpp/dolfinx/fem/assemble_matrix_impl.h
index e0ed682ca9..5be0094257 100644
--- a/cpp/dolfinx/fem/assemble_matrix_impl.h
+++ b/cpp/dolfinx/fem/assemble_matrix_impl.h
@@ -12,6 +12,7 @@
 #include "traits.h"
 #include "utils.h"
 #include <algorithm>
+#include <array>
 #include <dolfinx/la/utils.h>
 #include <dolfinx/mesh/Geometry.h>
 #include <dolfinx/mesh/Mesh.h>
@@ -136,8 +137,9 @@ void assemble_cells_matrix(
 
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
-    kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(), nullptr,
-           nullptr, custom_data.value_or(nullptr));
+    std::int32_t entity_local_index = static_cast<std::int32_t>(c);
+    kernel(Ae.data(), &coeffs(c, 0), constants.data(), cdofs.data(),
+           &entity_local_index, nullptr, custom_data.value_or(nullptr));
 
     // Compute A = P_0 \tilde{A} P_1^T (dof transformation)
     P0(Ae, cell_info0, cell0, ndim1);  // B = P0 \tilde{A}
@@ -314,8 +316,10 @@ void assemble_entities(
 
     // Tabulate tensor
     std::ranges::fill(Ae, 0);
+    std::array<std::int32_t, 2> entity_local_index{
+        local_entity, static_cast<std::int32_t>(f)};
     kernel(Ae.data(), &coeffs(f, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, custom_data.value_or(nullptr));
+           entity_local_index.data(), &perm, custom_data.value_or(nullptr));
     P0(Ae, cell_info0, cell0, ndim1);
     P1T(Ae, cell_info1, cell1, ndim0);
 
@@ -522,8 +526,10 @@ void assemble_interior_facets(
                           ? std::array<std::uint8_t, 2>{0, 0}
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
+    std::array<std::int32_t, 3> entity_local_index{
+        local_facet[0], local_facet[1], static_cast<std::int32_t>(f)};
     kernel(Ae.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), custom_data.value_or(nullptr));
+           entity_local_index.data(), perm.data(), custom_data.value_or(nullptr));
 
     // Local element layout is a 2x2 block matrix with structure
     //
diff --git a/cpp/dolfinx/fem/assemble_scalar_impl.h b/cpp/dolfinx/fem/assemble_scalar_impl.h
index 016c3a9bd4..c609dde704 100644
--- a/cpp/dolfinx/fem/assemble_scalar_impl.h
+++ b/cpp/dolfinx/fem/assemble_scalar_impl.h
@@ -11,6 +11,7 @@
 #include "FunctionSpace.h"
 #include "utils.h"
 #include <algorithm>
+#include <array>
 #include <basix/mdspan.hpp>
 #include <dolfinx/common/IndexMap.h>
 #include <dolfinx/mesh/Geometry.h>
@@ -50,8 +51,9 @@ T assemble_cells(mdspan2_t x_dofmap,
     for (std::size_t i = 0; i < x_dofs.size(); ++i)
       std::copy_n(&x(x_dofs[i], 0), 3, std::next(cdofs_b.begin(), 3 * i));
 
-    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(), nullptr,
-       nullptr, custom_data.value_or(nullptr));
+    std::int32_t entity_local_index = static_cast<std::int32_t>(index);
+    fn(&value, &coeffs(index, 0), constants.data(), cdofs_b.data(),
+       &entity_local_index, nullptr, custom_data.value_or(nullptr));
   }
 
   return value;
@@ -101,8 +103,10 @@ T assemble_entities(
 
     // Permutations
     std::uint8_t perm = perms.empty() ? 0 : perms(cell, local_entity);
-    fn(&value, &coeffs(f, 0), constants.data(), cdofs_b.data(), &local_entity,
-       &perm, custom_data.value_or(nullptr));
+    std::array<std::int32_t, 2> entity_local_index{
+        local_entity, static_cast<std::int32_t>(f)};
+    fn(&value, &coeffs(f, 0), constants.data(), cdofs_b.data(),
+       entity_local_index.data(), &perm, custom_data.value_or(nullptr));
   }
 
   return value;
@@ -153,8 +157,10 @@ T assemble_interior_facets(
                           ? std::array<std::uint8_t, 2>{0, 0}
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
+    std::array<std::int32_t, 3> entity_local_index{
+        local_facet[0], local_facet[1], static_cast<std::int32_t>(f)};
     fn(&value, &coeffs(f, 0, 0), constants.data(), cdofs_b.data(),
-       local_facet.data(), perm.data(), custom_data.value_or(nullptr));
+       entity_local_index.data(), perm.data(), custom_data.value_or(nullptr));
   }
 
   return value;
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index 8b0ce25b68..841a75f4ca 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -13,6 +13,7 @@
 #include "traits.h"
 #include "utils.h"
 #include <algorithm>
+#include <array>
 #include <basix/mdspan.hpp>
 #include <cstdint>
 #include <dolfinx/common/IndexMap.h>
@@ -102,8 +103,9 @@ void assemble_cells(
 
     // Tabulate vector for cell
     std::ranges::fill(be, 0);
-    kernel(be.data(), &coeffs(index, 0), constants.data(), cdofs.data(), &c,
-           nullptr, custom_data.value_or(nullptr));
+    std::int32_t entity_local_index = static_cast<std::int32_t>(index);
+    kernel(be.data(), &coeffs(index, 0), constants.data(), cdofs.data(),
+           &entity_local_index, nullptr, custom_data.value_or(nullptr));
     P0(be, cell_info0, c0, 1);
 
     // Scatter cell vector to 'global' vector array
@@ -208,8 +210,10 @@ void assemble_entities(
 
     // Tabulate element vector
     std::ranges::fill(be, 0);
+    std::array<std::int32_t, 2> entity_local_index{
+        local_entity, static_cast<std::int32_t>(f)};
     kernel(be.data(), &coeffs(f, 0), constants.data(), cdofs.data(),
-           &local_entity, &perm, custom_data.value_or(nullptr));
+           entity_local_index.data(), &perm, custom_data.value_or(nullptr));
     P0(be, cell_info0, cell0, 1);
 
     // Add element vector to global vector
@@ -328,8 +332,10 @@ void assemble_interior_facets(
                           ? std::array<std::uint8_t, 2>{0, 0}
                           : std::array{perms(cells[0], local_facet[0]),
                                        perms(cells[1], local_facet[1])};
+    std::array<std::int32_t, 3> entity_local_index{
+        local_facet[0], local_facet[1], static_cast<std::int32_t>(f)};
     kernel(be.data(), &coeffs(f, 0, 0), constants.data(), cdofs.data(),
-           local_facet.data(), perm.data(), custom_data.value_or(nullptr));
+           entity_local_index.data(), perm.data(), custom_data.value_or(nullptr));
 
     if (cells0[0] >= 0)
       P0(be, cell_info0, cells0[0], 1);
diff --git a/python/test/unit/fem/test_custom_data.py b/python/test/unit/fem/test_custom_data.py
index 3d77147cc1..09789669dd 100644
--- a/python/test/unit/fem/test_custom_data.py
+++ b/python/test/unit/fem/test_custom_data.py
@@ -17,11 +17,13 @@
     Form,
     IntegralType,
     assemble_matrix,
+    assemble_scalar,
     assemble_vector,
+    compute_integration_domains,
     form_cpp_class,
     functionspace,
 )
-from dolfinx.mesh import create_unit_square
+from dolfinx.mesh import create_unit_square, exterior_facet_indices
 
 numba = pytest.importorskip("numba")
 ufcx_signature = codegen_utils.numba_ufcx_kernel_signature
@@ -95,6 +97,140 @@ def tabulate(A_, w_, c_, coords_, entity_local_index, cell_orientation, custom_d
     return tabulate
 
 
+def tabulate_scalar_with_loop_index(dtype, xdtype, integral_type):
+    """Kernel that accumulates a custom-data value selected by loop index."""
+    voidptr_to_scalar_ptr = voidptr_to_dtype_ptr(dtype)
+
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate(A_, w_, c_, coords_, entity_local_index, orientation, custom_data):
+        A = numba.carray(A_, (1,), dtype=dtype)
+        data = voidptr_to_scalar_ptr(custom_data)
+
+        if integral_type == 0:
+            entity = numba.carray(entity_local_index, (1,), dtype=np.int32)
+            A[0] += data[entity[0]]
+        elif integral_type == 1:
+            entity = numba.carray(entity_local_index, (2,), dtype=np.int32)
+            A[0] += data[entity[1]] + 10 * entity[0]
+        else:
+            entity = numba.carray(entity_local_index, (3,), dtype=np.int32)
+            A[0] += data[entity[2]] + 10 * entity[0] + 100 * entity[1]
+
+    return tabulate
+
+
+def tabulate_vector_with_loop_index(dtype, xdtype, integral_type):
+    """Kernel that writes a custom-data value selected by loop index."""
+    voidptr_to_scalar_ptr = voidptr_to_dtype_ptr(dtype)
+
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate(b_, w_, c_, coords_, entity_local_index, orientation, custom_data):
+        b = numba.carray(b_, (3,), dtype=dtype)
+        data = voidptr_to_scalar_ptr(custom_data)
+        b[:] = 0
+
+        if integral_type == 0:
+            entity = numba.carray(entity_local_index, (1,), dtype=np.int32)
+            b[0] = data[entity[0]]
+        elif integral_type == 1:
+            entity = numba.carray(entity_local_index, (2,), dtype=np.int32)
+            b[0] = data[entity[1]] + 10 * entity[0]
+        else:
+            entity = numba.carray(entity_local_index, (3,), dtype=np.int32)
+            b[0] = data[entity[2]] + 10 * entity[0] + 100 * entity[1]
+
+    return tabulate
+
+
+def tabulate_matrix_with_loop_index(dtype, xdtype, integral_type):
+    """Kernel that writes a custom-data value selected by loop index."""
+    voidptr_to_scalar_ptr = voidptr_to_dtype_ptr(dtype)
+
+    @numba.cfunc(ufcx_signature(dtype, xdtype), nopython=True)
+    def tabulate(A_, w_, c_, coords_, entity_local_index, orientation, custom_data):
+        A = numba.carray(A_, (3, 3), dtype=dtype)
+        data = voidptr_to_scalar_ptr(custom_data)
+        A[:, :] = 0
+
+        if integral_type == 0:
+            entity = numba.carray(entity_local_index, (1,), dtype=np.int32)
+            A[0, 0] = data[entity[0]]
+        elif integral_type == 1:
+            entity = numba.carray(entity_local_index, (2,), dtype=np.int32)
+            A[0, 0] = data[entity[1]] + 10 * entity[0]
+        else:
+            entity = numba.carray(entity_local_index, (3,), dtype=np.int32)
+            A[0, 0] = data[entity[2]] + 10 * entity[0] + 100 * entity[1]
+
+    return tabulate
+
+
+def loop_index_entities(mesh, integral_type):
+    """Return local integration entities for a loop-index test."""
+    tdim = mesh.topology.dim
+    if integral_type == IntegralType.cell:
+        num_cells = mesh.topology.index_map(tdim).size_local
+        return np.arange(num_cells, dtype=np.int32)
+
+    mesh.topology.create_connectivity(tdim - 1, tdim)
+    exterior_facets = exterior_facet_indices(mesh.topology)
+    if integral_type == IntegralType.exterior_facet:
+        return compute_integration_domains(integral_type, mesh.topology, exterior_facets)
+
+    num_facets = mesh.topology.index_map(tdim - 1).size_local
+    facets = np.arange(num_facets, dtype=np.int32)
+    interior_facets = np.setdiff1d(facets, exterior_facets).astype(np.int32)
+    return compute_integration_domains(integral_type, mesh.topology, interior_facets)
+
+
+def expected_loop_index_sum(mesh, integral_type, entities, data):
+    """Return the expected global sum for the loop-index test kernels."""
+    if integral_type == IntegralType.cell:
+        local_value = np.sum(data)
+    elif integral_type == IntegralType.exterior_facet:
+        entity = entities.reshape(-1, 2)
+        local_value = np.sum(data) + 10 * np.sum(entity[:, 1])
+    else:
+        entity = entities.reshape(-1, 4)
+        local_value = np.sum(data) + 10 * np.sum(entity[:, 1]) + 100 * np.sum(entity[:, 3])
+
+    return mesh.comm.allreduce(local_value, op=MPI.SUM)
+
+
+def assemble_loop_index_sum(mesh, V, dtype, integral_type, kernel, entities, data, rank):
+    """Assemble a custom kernel and return the global sum of assembled entries."""
+    active_coeffs = np.array([], dtype=np.int8)
+    integrals = {integral_type: [(0, kernel.address, entities, active_coeffs, data.ctypes.data)]}
+    formtype = form_cpp_class(dtype)
+
+    if rank == 0:
+        F = Form(formtype([], integrals, [], [], False, [], mesh=mesh._cpp_object))
+        local_value = assemble_scalar(F)
+    elif rank == 1:
+        F = Form(formtype([V._cpp_object], integrals, [], [], False, [], mesh=mesh._cpp_object))
+        b = assemble_vector(F)
+        b.scatter_reverse(la.InsertMode.add)
+        num_owned = V.dofmap.index_map.size_local * V.dofmap.index_map_bs
+        local_value = np.sum(b.array[:num_owned])
+    else:
+        F = Form(
+            formtype(
+                [V._cpp_object, V._cpp_object],
+                integrals,
+                [],
+                [],
+                False,
+                [],
+                mesh=mesh._cpp_object,
+            )
+        )
+        A = assemble_matrix(F)
+        A.scatter_reverse()
+        local_value = A.to_scipy().sum()
+
+    return mesh.comm.allreduce(local_value, op=MPI.SUM)
+
+
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 def test_custom_data_vector_assembly(dtype):
     """Test that custom_data is correctly passed to kernels during vector assembly."""
@@ -191,6 +327,45 @@ def test_custom_data_matrix_assembly(dtype):
     assert np.isclose(norm2, 2.0 * norm1)
 
 
+@pytest.mark.parametrize("dtype", [np.float64, np.float32])
+@pytest.mark.parametrize(
+    "integral_type, integral_code",
+    [
+        (IntegralType.cell, 0),
+        (IntegralType.exterior_facet, 1),
+        (IntegralType.interior_facet, 2),
+    ],
+)
+@pytest.mark.parametrize("rank", [0, 1, 2])
+def test_custom_data_loop_index_all_assemblers(dtype, integral_type, integral_code, rank):
+    """Test that entity_local_index carries loop indices in all assemblers.
+
+    The test kernels use the loop index to select per-entity custom data. For
+    facet kernels they also check that the existing local facet/entity entries
+    remain available before the appended loop index.
+    """
+    xdtype = np.real(dtype(0)).dtype
+    mesh = create_unit_square(MPI.COMM_WORLD, 4, 3, dtype=xdtype)
+    V = functionspace(mesh, ("Lagrange", 1))
+    entities = loop_index_entities(mesh, integral_type)
+    num_entities = entities.size if integral_type == IntegralType.cell else entities.size // 2
+    if integral_type == IntegralType.interior_facet:
+        num_entities = entities.size // 4
+
+    data = np.arange(1, num_entities + 1, dtype=dtype)
+    expected = expected_loop_index_sum(mesh, integral_type, entities, data)
+
+    if rank == 0:
+        kernel = tabulate_scalar_with_loop_index(dtype, xdtype, integral_code)
+    elif rank == 1:
+        kernel = tabulate_vector_with_loop_index(dtype, xdtype, integral_code)
+    else:
+        kernel = tabulate_matrix_with_loop_index(dtype, xdtype, integral_code)
+
+    actual = assemble_loop_index_sum(mesh, V, dtype, integral_type, kernel, entities, data, rank)
+    assert np.isclose(actual, expected)
+
+
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
 def test_custom_data_default_nullptr(dtype):
     """Test that custom_data defaults to nullptr (None) when not provided."""