diff --git a/include/utils/tracked_alloc.h b/include/utils/tracked_alloc.h
index adc982d..33b7077 100644
--- a/include/utils/tracked_alloc.h
+++ b/include/utils/tracked_alloc.h
@@ -21,20 +21,60 @@
 #include <stddef.h>
 #include <stdlib.h>
 
-extern size_t g_allocated_bytes;
+/* Platform shim for "how many usable bytes are at this malloc'd pointer".
+    Used to track total live bytes */
+#if defined(__APPLE__)
+#include <malloc/malloc.h>
+#define TRACKED_BLOCK_SIZE(p) malloc_size(p)
+#elif defined(_WIN32) || defined(_WIN64)
+#include <malloc.h>
+#define TRACKED_BLOCK_SIZE(p) _msize(p)
+#else
+#include <malloc.h>
+#define TRACKED_BLOCK_SIZE(p) malloc_usable_size(p)
+#endif
 
-static inline void *SP_MALLOC(size_t size)
+extern size_t g_allocated_bytes; /* current live bytes */
+extern size_t g_peak_bytes;      /* high-water mark since last reset */
+
+/* All allocations in src/ must go through these wrappers (and pair sp_free
+   with sp_malloc / sp_calloc). Tests may use plain malloc/free — those
+   bytes are simply not tracked. */
+static inline void *sp_malloc(size_t size)
 {
     void *ptr = malloc(size);
-    if (ptr) g_allocated_bytes += size;
+    if (ptr)
+    {
+        g_allocated_bytes += TRACKED_BLOCK_SIZE(ptr);
+        if (g_allocated_bytes > g_peak_bytes)
+        {
+            g_peak_bytes = g_allocated_bytes;
+        }
+    }
     return ptr;
 }
 
-static inline void *SP_CALLOC(size_t count, size_t size)
+static inline void *sp_calloc(size_t count, size_t size)
 {
     void *ptr = calloc(count, size);
-    if (ptr) g_allocated_bytes += count * size;
+    if (ptr)
+    {
+        g_allocated_bytes += TRACKED_BLOCK_SIZE(ptr);
+        if (g_allocated_bytes > g_peak_bytes)
+        {
+            g_peak_bytes = g_allocated_bytes;
+        }
+    }
     return ptr;
 }
 
+static inline void sp_free(void *ptr)
+{
+    if (ptr)
+    {
+        g_allocated_bytes -= TRACKED_BLOCK_SIZE(ptr);
+        free(ptr);
+    }
+}
+
 #endif /* TRACKED_ALLOC_H */
diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c
index 288bfde..7f434e0 100644
--- a/src/atoms/affine/add.c
+++ b/src/atoms/affine/add.c
@@ -95,7 +95,7 @@ static bool is_affine(const expr *node)
 expr *new_add(expr *left, expr *right)
 {
     assert(left->d1 == right->d1 && left->d2 == right->d2);
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, left->d1, left->d2, left->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = left;
diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c
index 213461b..f5792f8 100644
--- a/src/atoms/affine/broadcast.c
+++ b/src/atoms/affine/broadcast.c
@@ -96,7 +96,7 @@ static void wsum_hess_init_impl(expr *node)
     node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess);
 
     /* allocate space for weight vector */
-    node->work->dwork = SP_MALLOC(node->size * sizeof(double));
+    node->work->dwork = sp_malloc(node->size * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -176,7 +176,7 @@ expr *new_broadcast(expr *child, int d1, int d2)
         exit(1);
     }
 
-    broadcast_expr *bcast = (broadcast_expr *) SP_CALLOC(1, sizeof(broadcast_expr));
+    broadcast_expr *bcast = (broadcast_expr *) sp_calloc(1, sizeof(broadcast_expr));
     expr *node = (expr *) bcast;
 
     // --------------------------------------------------------------------------
diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c
index 9cdfeb7..1698057 100644
--- a/src/atoms/affine/convolve.c
+++ b/src/atoms/affine/convolve.c
@@ -82,7 +82,8 @@ static void jacobian_init_impl(expr *node)
     conv_matrix_fill_values(cnode->T, a);
 
     /* J = T @ J_child */
-    cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian->to_csr(child->jacobian), node->work->iwork);
+    cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian->to_csr(child->jacobian),
+                                         node->work->iwork);
     node->jacobian =
         new_sparse_matrix(csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC));
 }
@@ -95,8 +96,8 @@ static void eval_jacobian(expr *node)
     child->eval_jacobian(child);
 
     /* J = T @ J_child */
-    csr_to_csc_fill_values(child->jacobian->to_csr(child->jacobian), cnode->Jchild_CSC,
-                           node->work->iwork);
+    csr_to_csc_fill_values(child->jacobian->to_csr(child->jacobian),
+                           cnode->Jchild_CSC, node->work->iwork);
     csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC,
                                node->jacobian->to_csr(node->jacobian));
 }
@@ -108,7 +109,7 @@ static void wsum_hess_init_impl(expr *node)
 
     wsum_hess_init(child);
     node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess);
-    node->work->dwork = (double *) SP_MALLOC(cnode->n * sizeof(double));
+    node->work->dwork = (double *) sp_malloc(cnode->n * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -174,7 +175,7 @@ expr *new_convolve(expr *param_node, expr *child)
         exit(1);
     }
 
-    convolve_expr *cnode = (convolve_expr *) SP_CALLOC(1, sizeof(convolve_expr));
+    convolve_expr *cnode = (convolve_expr *) sp_calloc(1, sizeof(convolve_expr));
     expr *node = &cnode->base;
     init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess,
@@ -188,7 +189,7 @@ expr *new_convolve(expr *param_node, expr *child)
     cnode->n = n;
 
     /* iwork is used for csr_to_csc of child's Jacobian (size n_vars). */
-    node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int));
+    node->work->iwork = (int *) sp_malloc(node->n_vars * sizeof(int));
 
     /* Ensure first forward() pulls current param values through any
        broadcast/promote wrappers and reflects them in T (once T is built). */
diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c
index 03fbd45..d5d74a0 100644
--- a/src/atoms/affine/diag_vec.c
+++ b/src/atoms/affine/diag_vec.c
@@ -70,7 +70,7 @@ static void wsum_hess_init_impl(expr *node)
     wsum_hess_init(x);
 
     /* workspace for extracting diagonal weights */
-    node->work->dwork = (double *) SP_CALLOC(x->size, sizeof(double));
+    node->work->dwork = (double *) sp_calloc(x->size, sizeof(double));
 
     /* Copy child's Hessian structure (diag_vec is linear, so its own Hessian is
      * zero) */
@@ -106,7 +106,7 @@ expr *new_diag_vec(expr *child)
 
     /* n is the number of elements (works for both row and column vectors) */
     int n = child->size;
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, n, n, child->n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = child;
diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c
index a34f8a4..5bbe8fe 100644
--- a/src/atoms/affine/hstack.c
+++ b/src/atoms/affine/hstack.c
@@ -187,14 +187,14 @@ expr *new_hstack(expr **args, int n_args, int n_vars)
     }
 
     /* Allocate the type-specific struct */
-    hstack_expr *hnode = (hstack_expr *) SP_CALLOC(1, sizeof(hstack_expr));
+    hstack_expr *hnode = (hstack_expr *) sp_calloc(1, sizeof(hstack_expr));
     expr *node = &hnode->base;
     init_expr(node, args[0]->d1, d2, n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, wsum_hess_eval,
               free_type_data);
 
     /* Set type-specific fields (deep copy args array) */
-    hnode->args = (expr **) SP_CALLOC(n_args, sizeof(expr *));
+    hnode->args = (expr **) sp_calloc(n_args, sizeof(expr *));
     hnode->n_args = n_args;
     for (int i = 0; i < n_args; i++)
     {
diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c
index 3fd070e..c41f79c 100644
--- a/src/atoms/affine/index.c
+++ b/src/atoms/affine/index.c
@@ -29,7 +29,7 @@
  * Returns true if duplicates exist, false otherwise. */
 static bool check_for_duplicates(const int *indices, int n_idxs, int max_idx)
 {
-    bool *seen = (bool *) SP_CALLOC(max_idx, sizeof(bool));
+    bool *seen = (bool *) sp_calloc(max_idx, sizeof(bool));
     bool has_dup = false;
     for (int i = 0; i < n_idxs && !has_dup; i++)
     {
@@ -89,7 +89,7 @@ static void wsum_hess_init_impl(expr *node)
     wsum_hess_init(x);
 
     /* for setting weight vector to evaluate hessian of child */
-    node->work->dwork = (double *) SP_CALLOC(x->size, sizeof(double));
+    node->work->dwork = (double *) sp_calloc(x->size, sizeof(double));
 
     /* in the implementation of eval_wsum_hess we evaluate the
        child's hessian with a weight vector that has w[i] = 0
@@ -148,7 +148,7 @@ expr *new_index(expr *child, int d1, int d2, const int *indices, int n_idxs)
 {
     assert(d1 * d2 == n_idxs);
     /* allocate type-specific struct */
-    index_expr *idx = (index_expr *) SP_CALLOC(1, sizeof(index_expr));
+    index_expr *idx = (index_expr *) sp_calloc(1, sizeof(index_expr));
     expr *node = &idx->base;
 
     init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl,
@@ -159,7 +159,7 @@ expr *new_index(expr *child, int d1, int d2, const int *indices, int n_idxs)
     expr_retain(child);
 
     /* copy indices */
-    idx->indices = (int *) SP_MALLOC(n_idxs * sizeof(int));
+    idx->indices = (int *) sp_malloc(n_idxs * sizeof(int));
     memcpy(idx->indices, indices, n_idxs * sizeof(int));
     idx->n_idxs = n_idxs;
 
diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c
index e3e0550..1336ddc 100644
--- a/src/atoms/affine/left_matmul.c
+++ b/src/atoms/affine/left_matmul.c
@@ -201,7 +201,7 @@ static void wsum_hess_init_impl(expr *node)
     /* work for computing A^T w*/
     int n_blocks = ((left_matmul_expr *) node)->n_blocks;
     int dim = ((left_matmul_expr *) node)->AT->m * n_blocks;
-    node->work->dwork = (double *) SP_MALLOC(dim * sizeof(double));
+    node->work->dwork = (double *) sp_malloc(dim * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -263,7 +263,7 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A)
 
     /* Allocate the type-specific struct */
     left_matmul_expr *lnode =
-        (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr));
+        (left_matmul_expr *) sp_calloc(1, sizeof(left_matmul_expr));
     expr *node = &lnode->base;
     /* Sparse A — always the general CSC-mirror path. */
     init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_sparse,
@@ -276,8 +276,8 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A)
        (requiring size node->n_vars).
        csc_to_csr_work is used for converting J_CSC to CSR_matrix (requiring
        node->size) */
-    node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int));
-    lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int));
+    node->work->iwork = (int *) sp_malloc(node->n_vars * sizeof(int));
+    lnode->csc_to_csr_work = (int *) sp_malloc(node->size * sizeof(int));
     lnode->n_blocks = n_blocks;
 
     /* store A and AT. new_sparse_matrix takes ownership, so clone first. */
@@ -304,7 +304,7 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n,
     compute_dims(u, m, n, &d1, &d2, &n_blocks);
 
     left_matmul_expr *lnode =
-        (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr));
+        (left_matmul_expr *) sp_calloc(1, sizeof(left_matmul_expr));
     expr *node = &lnode->base;
     init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_dense,
               eval_jacobian_dense, is_affine, wsum_hess_init_impl, eval_wsum_hess,
diff --git a/src/atoms/affine/neg.c b/src/atoms/affine/neg.c
index 01bbf5e..a42c305 100644
--- a/src/atoms/affine/neg.c
+++ b/src/atoms/affine/neg.c
@@ -85,7 +85,7 @@ static bool is_affine(const expr *node)
 
 expr *new_neg(expr *child)
 {
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, child->d1, child->d2, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = child;
diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c
index 0b9ffbe..a757edc 100644
--- a/src/atoms/affine/parameter.c
+++ b/src/atoms/affine/parameter.c
@@ -61,7 +61,7 @@ static bool is_affine(const expr *node)
 
 expr *new_parameter(int d1, int d2, int param_id, int n_vars, const double *values)
 {
-    parameter_expr *pnode = (parameter_expr *) SP_CALLOC(1, sizeof(parameter_expr));
+    parameter_expr *pnode = (parameter_expr *) sp_calloc(1, sizeof(parameter_expr));
     expr *node = &pnode->base;
     init_expr(node, d1, d2, n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c
index 1a031d8..529f846 100644
--- a/src/atoms/affine/promote.c
+++ b/src/atoms/affine/promote.c
@@ -84,7 +84,7 @@ static bool is_affine(const expr *node)
 expr *new_promote(expr *child, int d1, int d2)
 {
     assert(child->size == 1);
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = child;
diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c
index e59a7ec..5517112 100644
--- a/src/atoms/affine/reshape.c
+++ b/src/atoms/affine/reshape.c
@@ -69,7 +69,7 @@ static bool is_affine(const expr *node)
 expr *new_reshape(expr *child, int d1, int d2)
 {
     assert(d1 * d2 == child->size);
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = child;
diff --git a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c
index 5e3b39f..f76db91 100644
--- a/src/atoms/affine/right_matmul.c
+++ b/src/atoms/affine/right_matmul.c
@@ -58,7 +58,7 @@ expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A)
 {
     /* We can express right matmul using left matmul and transpose:
        u @ A = (A^T @ u^T)^T. */
-    int *work_transpose = (int *) SP_MALLOC(A->n * sizeof(int));
+    int *work_transpose = (int *) sp_malloc(A->n * sizeof(int));
     CSR_matrix *AT = transpose(A, work_transpose);
 
     expr *u_transpose = new_transpose(u);
@@ -104,7 +104,7 @@ expr *new_right_matmul_dense(expr *param_node, expr *u, int m, int n,
     }
     else
     {
-        double *AT = (double *) SP_MALLOC(n * m * sizeof(double));
+        double *AT = (double *) sp_malloc(n * m * sizeof(double));
         A_transpose(AT, data, m, n);
         left_matmul_node = new_left_matmul_dense(NULL, u_transpose, n, m, AT);
         free(AT);
diff --git a/src/atoms/affine/scalar_mult.c b/src/atoms/affine/scalar_mult.c
index 433f3e1..dbf8455 100644
--- a/src/atoms/affine/scalar_mult.c
+++ b/src/atoms/affine/scalar_mult.c
@@ -118,7 +118,7 @@ static void free_type_data(expr *node)
 expr *new_scalar_mult(expr *param_node, expr *child)
 {
     scalar_mult_expr *mult_node =
-        (scalar_mult_expr *) SP_CALLOC(1, sizeof(scalar_mult_expr));
+        (scalar_mult_expr *) sp_calloc(1, sizeof(scalar_mult_expr));
     expr *node = &mult_node->base;
 
     init_expr(node, child->d1, child->d2, child->n_vars, forward, jacobian_init_impl,
diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c
index 5de6b80..f042d11 100644
--- a/src/atoms/affine/sum.c
+++ b/src/atoms/affine/sum.c
@@ -93,8 +93,8 @@ static void jacobian_init_impl(expr *node)
 
     /* we never have to store more than the child's nnz */
     CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, Jx->nnz);
-    node->work->iwork = SP_MALLOC(MAX(jac->n, Jx->nnz) * sizeof(int));
-    snode->idx_map = SP_MALLOC(Jx->nnz * sizeof(int));
+    node->work->iwork = sp_malloc(MAX(jac->n, Jx->nnz) * sizeof(int));
+    snode->idx_map = sp_malloc(Jx->nnz * sizeof(int));
 
     /* the idx_map array maps each nonzero entry j in x->jacobian
        to the corresponding index in the output row matrix C. Specifically, for
@@ -150,7 +150,7 @@ static void wsum_hess_init_impl(expr *node)
 
     /* we never have to store more than the child's nnz */
     node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess);
-    node->work->dwork = SP_MALLOC(child->size * sizeof(double));
+    node->work->dwork = sp_malloc(child->size * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -210,7 +210,7 @@ expr *new_sum(expr *child, int axis)
     }
 
     /* Allocate the type-specific struct */
-    sum_expr *snode = (sum_expr *) SP_CALLOC(1, sizeof(sum_expr));
+    sum_expr *snode = (sum_expr *) sp_calloc(1, sizeof(sum_expr));
     expr *node = &snode->base;
 
     /* to be consistent with CVXPY and NumPy we treat the result from
diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c
index 846a45b..2dca9a5 100644
--- a/src/atoms/affine/trace.c
+++ b/src/atoms/affine/trace.c
@@ -72,14 +72,14 @@ static void jacobian_init_impl(expr *node)
     // fill sparsity pattern and idx_map
     // ---------------------------------------------------------------
     trace_expr *tnode = (trace_expr *) node;
-    node->work->iwork = SP_MALLOC(MAX(jac->n, total_nnz) * sizeof(int));
+    node->work->iwork = sp_malloc(MAX(jac->n, total_nnz) * sizeof(int));
 
     /* the idx_map array maps each nonzero entry j in the original matrix A (from the
        selected, evenly spaced rows) to the corresponding index in the output row
        matrix C. Specifically, for each nonzero entry j in A (from the selected
        rows), idx_map[j] gives the position in C->x where the value from A->x[j]
        should be accumulated. */
-    tnode->idx_map = SP_MALLOC(A->nnz * sizeof(int));
+    tnode->idx_map = sp_malloc(A->nnz * sizeof(int));
     sum_spaced_rows_into_row_csr_alloc(A, jac, row_spacing, node->work->iwork,
                                        tnode->idx_map);
     node->jacobian = new_sparse_matrix(jac);
@@ -107,7 +107,7 @@ static void wsum_hess_init_impl(expr *node)
     /* initialize child's hessian */
     wsum_hess_init(x);
 
-    node->work->dwork = (double *) SP_CALLOC(x->size, sizeof(double));
+    node->work->dwork = (double *) sp_calloc(x->size, sizeof(double));
 
     /* We copy over the sparsity pattern from the child. This also includes the
        contribution to wsum_hess of entries of the child that will always have
@@ -148,7 +148,7 @@ static void free_type_data(expr *node)
 
 expr *new_trace(expr *child)
 {
-    trace_expr *tnode = (trace_expr *) SP_CALLOC(1, sizeof(trace_expr));
+    trace_expr *tnode = (trace_expr *) sp_calloc(1, sizeof(trace_expr));
     expr *node = &tnode->base;
     init_expr(node, 1, 1, child->n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data);
diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c
index 59c961e..4760376 100644
--- a/src/atoms/affine/transpose.c
+++ b/src/atoms/affine/transpose.c
@@ -44,7 +44,7 @@ static void jacobian_init_impl(expr *node)
 
     /* The transpose's Jacobian is a row permutation of the child's:
        J_node[r, :] = J_child[k(r), :] where k(r) = (r/d1) + (r%d1)*d2. */
-    int *indices = (int *) SP_MALLOC(n_out * sizeof(int));
+    int *indices = (int *) sp_malloc(n_out * sizeof(int));
     for (int r = 0; r < n_out; r++)
     {
         indices[r] = (r / d1) + (r % d1) * d2;
@@ -74,7 +74,7 @@ static void wsum_hess_init_impl(expr *node)
     node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess);
 
     /* for computing Kw where K is the commutation matrix */
-    node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double));
+    node->work->dwork = (double *) sp_malloc(node->size * sizeof(double));
 }
 static void eval_wsum_hess(expr *node, const double *w)
 {
@@ -104,7 +104,7 @@ static bool is_affine(const expr *node)
 
 expr *new_transpose(expr *child)
 {
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, child->d2, child->d1, child->n_vars, forward, jacobian_init_impl,
               eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = child;
diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c
index fe010e7..3291829 100644
--- a/src/atoms/affine/variable.c
+++ b/src/atoms/affine/variable.c
@@ -66,7 +66,7 @@ static bool is_affine(const expr *node)
 
 expr *new_variable(int d1, int d2, int var_id, int n_vars)
 {
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, d1, d2, n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, wsum_hess_eval, NULL);
     node->var_id = var_id;
diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c
index ee7992b..6a3b2cf 100644
--- a/src/atoms/affine/vector_mult.c
+++ b/src/atoms/affine/vector_mult.c
@@ -85,7 +85,7 @@ static void wsum_hess_init_impl(expr *node)
     node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess);
 
     /* workspace for storing scaled weights */
-    node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double));
+    node->work->dwork = (double *) sp_malloc(node->size * sizeof(double));
 }
 
 static void eval_wsum_hess(expr *node, const double *w)
@@ -124,7 +124,7 @@ static bool is_affine(const expr *node)
 expr *new_vector_mult(expr *param_node, expr *child)
 {
     vector_mult_expr *vnode =
-        (vector_mult_expr *) SP_CALLOC(1, sizeof(vector_mult_expr));
+        (vector_mult_expr *) sp_calloc(1, sizeof(vector_mult_expr));
     expr *node = &vnode->base;
 
     init_expr(node, child->d1, child->d2, child->n_vars, forward, jacobian_init_impl,
diff --git a/src/atoms/affine/vstack.c b/src/atoms/affine/vstack.c
index 18e1376..87ecb9b 100644
--- a/src/atoms/affine/vstack.c
+++ b/src/atoms/affine/vstack.c
@@ -35,7 +35,7 @@ expr *new_vstack(expr **args, int n_args, int n_vars)
         assert(args[i]->d2 == args[0]->d2);
     }
 
-    expr **transposed = (expr **) SP_MALLOC(n_args * sizeof(expr *));
+    expr **transposed = (expr **) sp_malloc(n_args * sizeof(expr *));
     for (int i = 0; i < n_args; i++)
     {
         transposed[i] = new_transpose(args[i]);
diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c
index 4180327..f75d1e1 100644
--- a/src/atoms/bivariate_full_dom/matmul.c
+++ b/src/atoms/bivariate_full_dom/matmul.c
@@ -433,12 +433,12 @@ static void wsum_hess_init_chain_rule(expr *node)
     mnode->B = build_cross_hessian_sparsity(m, k, n);
     mnode->BJg = csr_csc_matmul_alloc(mnode->B, Jg);
     int max_alloc = MAX(mnode->BJg->m, mnode->BJg->n);
-    mnode->BJg_csc_work = (int *) SP_MALLOC(max_alloc * sizeof(int));
+    mnode->BJg_csc_work = (int *) sp_malloc(max_alloc * sizeof(int));
     mnode->BJg_CSC = csr_to_csc_alloc(mnode->BJg, mnode->BJg_csc_work);
     mnode->C = BTA_alloc(mnode->BJg_CSC, Jf);
 
     /* initialize C^T */
-    node->work->iwork = (int *) SP_MALLOC(mnode->C->m * sizeof(int));
+    node->work->iwork = (int *) sp_malloc(mnode->C->m * sizeof(int));
     mnode->CT = AT_alloc(mnode->C, node->work->iwork);
 
     /* initialize Hessians of children */
@@ -475,7 +475,7 @@ static void wsum_hess_init_chain_rule(expr *node)
     if (!f->is_affine(f) || !g->is_affine(g))
     {
         node->work->dwork =
-            (double *) SP_MALLOC(MAX(f->size, g->size) * sizeof(double));
+            (double *) sp_malloc(MAX(f->size, g->size) * sizeof(double));
     }
 }
 
@@ -560,7 +560,7 @@ expr *new_matmul(expr *x, expr *y)
     }
 
     /* Allocate the expression node */
-    expr *node = (expr *) SP_CALLOC(1, sizeof(matmul_expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(matmul_expr));
 
     /* Choose no-chain-rule or chain-rule function pointers */
     bool use_chain_rule = !(x->var_id != NOT_A_VARIABLE &&
diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c
index 325f9a4..93c8e2e 100644
--- a/src/atoms/bivariate_full_dom/multiply.c
+++ b/src/atoms/bivariate_full_dom/multiply.c
@@ -149,7 +149,7 @@ static void wsum_hess_init_impl(expr *node)
         /* used for computing weights to wsum_hess of children */
         if (!x->is_affine(x) || !y->is_affine(y))
         {
-            node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double));
+            node->work->dwork = (double *) sp_malloc(node->size * sizeof(double));
         }
 
         /* For sparse matrices we need the CSC cache to be valid for the
@@ -318,7 +318,7 @@ static bool is_affine(const expr *node)
 expr *new_elementwise_mult(expr *left, expr *right)
 {
     elementwise_mult_expr *mul_node =
-        (elementwise_mult_expr *) SP_CALLOC(1, sizeof(elementwise_mult_expr));
+        (elementwise_mult_expr *) sp_calloc(1, sizeof(elementwise_mult_expr));
     expr *node = &mul_node->base;
 
     init_expr(node, left->d1, left->d2, left->n_vars, forward, jacobian_init_impl,
diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c
index 12be5a1..4a49843 100644
--- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c
+++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c
@@ -84,10 +84,10 @@ static void jacobian_init_impl(expr *node)
     }
     else /* left node is not a variable (guaranteed to be a linear operator) */
     {
-        node->work->dwork = (double *) SP_MALLOC(x->size * sizeof(double));
+        node->work->dwork = (double *) sp_malloc(x->size * sizeof(double));
 
         /* compute required allocation and allocate jacobian */
-        bool *col_nz = (bool *) SP_CALLOC(
+        bool *col_nz = (bool *) sp_calloc(
             node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/
         CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian);
         int nonzero_cols = count_nonzero_cols(Jx, col_nz);
@@ -114,7 +114,7 @@ static void jacobian_init_impl(expr *node)
         jac->p[1] = jac->nnz;
 
         /* find position where y should be inserted */
-        node->work->iwork = (int *) SP_MALLOC(sizeof(int));
+        node->work->iwork = (int *) sp_malloc(sizeof(int));
         for (int j = 0; j < jac->nnz; j++)
         {
             if (jac->i[j] == y->var_id)
@@ -340,7 +340,7 @@ expr *new_quad_over_lin(expr *left, expr *right)
         exit(EXIT_FAILURE);
     }
 
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, 1, 1, left->n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL);
     node->left = left;
diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c
index ecd4520..3dca836 100644
--- a/src/atoms/bivariate_restricted_dom/rel_entr.c
+++ b/src/atoms/bivariate_restricted_dom/rel_entr.c
@@ -206,7 +206,7 @@ expr *new_rel_entr_vector_args(expr *left, expr *right)
         exit(EXIT_FAILURE);
     }
 
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, left->d1, left->d2, left->n_vars, forward_vector_args,
               jacobian_init_vectors_args, eval_jacobian_vector_args, is_affine,
               wsum_hess_init_vector_args, eval_wsum_hess_vector_args, NULL);
diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c
index a356289..7216063 100644
--- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c
+++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c
@@ -105,8 +105,7 @@ static void wsum_hess_init_scalar_vector(expr *node)
     int var_id_x = x->var_id;
     int var_id_y = y->var_id;
 
-    CSR_matrix *H =
-        new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1);
+    CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1);
 
     if (var_id_x < var_id_y)
     {
@@ -220,7 +219,7 @@ static bool is_affine(const expr *node)
 expr *new_rel_entr_first_arg_scalar(expr *left, expr *right)
 {
     assert(left->d1 == 1 && left->d2 == 1);
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, right->d1, right->d2, left->n_vars, forward_scalar_vector,
               jacobian_init_scalar_vector, eval_jacobian_scalar_vector, is_affine,
               wsum_hess_init_scalar_vector, eval_wsum_hess_scalar_vector, NULL);
diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c
index f2b7928..12177c6 100644
--- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c
+++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c
@@ -105,8 +105,7 @@ static void wsum_hess_init_vector_scalar(expr *node)
     int var_id_x = x->var_id;
     int var_id_y = y->var_id;
 
-    CSR_matrix *H =
-        new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1);
+    CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1);
 
     if (var_id_x < var_id_y)
     {
@@ -221,7 +220,7 @@ static bool is_affine(const expr *node)
 expr *new_rel_entr_second_arg_scalar(expr *left, expr *right)
 {
     assert(right->d1 == 1 && right->d2 == 1);
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     init_expr(node, left->d1, left->d2, left->n_vars, forward_vector_scalar,
               jacobian_init_vector_scalar, eval_jacobian_vector_scalar, is_affine,
               wsum_hess_init_vector_scalar, eval_wsum_hess_vector_scalar, NULL);
diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c
index 352f784..c834b73 100644
--- a/src/atoms/elementwise_full_dom/common.c
+++ b/src/atoms/elementwise_full_dom/common.c
@@ -47,9 +47,9 @@ void jacobian_init_elementwise(expr *node)
         /* jacobian of h(x) = f(g(x)) is Jf @ Jg, and here Jf is diagonal */
         jacobian_init(child);
         node->jacobian = child->jacobian->copy_sparsity(child->jacobian);
-        node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double));
+        node->work->dwork = (double *) sp_malloc(node->size * sizeof(double));
         node->work->local_jac_diag =
-            (double *) SP_MALLOC(node->size * sizeof(double));
+            (double *) sp_malloc(node->size * sizeof(double));
     }
 }
 
@@ -118,8 +118,7 @@ void wsum_hess_init_elementwise(expr *node)
                 child->wsum_hess->copy_sparsity(child->wsum_hess);
 
             /* wsum_hess = term1 + term2 */
-            int max_nnz =
-                node->work->hess_term1->nnz + node->work->hess_term2->nnz;
+            int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz;
             node->wsum_hess =
                 new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz);
             sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2,
@@ -175,8 +174,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w)
                    child->wsum_hess->nnz * sizeof(double));
 
             /* wsum_hess = term1 + term2 */
-            sum_matrices_fill_values(node->work->hess_term1,
-                                     node->work->hess_term2, node->wsum_hess);
+            sum_matrices_fill_values(node->work->hess_term1, node->work->hess_term2,
+                                     node->wsum_hess);
         }
     }
 }
@@ -202,7 +201,7 @@ void init_elementwise(expr *node, expr *child)
 
 expr *new_elementwise(expr *child)
 {
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     if (!node) return NULL;
 
     init_elementwise(node, child);
diff --git a/src/atoms/elementwise_full_dom/power.c b/src/atoms/elementwise_full_dom/power.c
index cde9c34..bdd227c 100644
--- a/src/atoms/elementwise_full_dom/power.c
+++ b/src/atoms/elementwise_full_dom/power.c
@@ -61,7 +61,7 @@ static void local_wsum_hess(expr *node, double *out, const double *w)
 expr *new_power(expr *child, double p)
 {
     /* Allocate the type-specific struct */
-    power_expr *pnode = (power_expr *) SP_CALLOC(1, sizeof(power_expr));
+    power_expr *pnode = (power_expr *) sp_calloc(1, sizeof(power_expr));
     expr *node = &pnode->base;
     init_elementwise(node, child);
     node->forward = forward;
diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c
index 5c88733..bd90c95 100644
--- a/src/atoms/elementwise_restricted_dom/common.c
+++ b/src/atoms/elementwise_restricted_dom/common.c
@@ -63,7 +63,7 @@ bool is_affine_restricted(const expr *node)
 
 expr *new_restricted(expr *child)
 {
-    expr *node = (expr *) SP_CALLOC(1, sizeof(expr));
+    expr *node = (expr *) sp_calloc(1, sizeof(expr));
     if (!node) return NULL;
 
     init_expr(node, child->d1, child->d2, child->n_vars, NULL,
diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c
index aeb7226..3feab4a 100644
--- a/src/atoms/other/prod.c
+++ b/src/atoms/other/prod.c
@@ -217,7 +217,7 @@ I think they return row vectors.*/
 expr *new_prod(expr *child)
 {
     /* Output is scalar: 1 x 1 */
-    prod_expr *pnode = (prod_expr *) SP_CALLOC(1, sizeof(prod_expr));
+    prod_expr *pnode = (prod_expr *) sp_calloc(1, sizeof(prod_expr));
     expr *node = &pnode->base;
     init_expr(node, 1, 1, child->n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data);
diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c
index 2e0e971..5ef7cc6 100644
--- a/src/atoms/other/prod_axis_one.c
+++ b/src/atoms/other/prod_axis_one.c
@@ -398,7 +398,7 @@ static void free_type_data(expr *node)
 
 expr *new_prod_axis_one(expr *child)
 {
-    prod_axis *pnode = (prod_axis *) SP_CALLOC(1, sizeof(prod_axis));
+    prod_axis *pnode = (prod_axis *) sp_calloc(1, sizeof(prod_axis));
     expr *node = &pnode->base;
 
     /* output is always a row vector 1 x d1 (one product per row) */
@@ -407,9 +407,9 @@ expr *new_prod_axis_one(expr *child)
               free_type_data);
 
     /* allocate arrays to store per-row statistics */
-    pnode->num_of_zeros = (int *) SP_CALLOC(child->d1, sizeof(int));
-    pnode->zero_index = (int *) SP_CALLOC(child->d1, sizeof(int));
-    pnode->prod_nonzero = (double *) SP_CALLOC(child->d1, sizeof(double));
+    pnode->num_of_zeros = (int *) sp_calloc(child->d1, sizeof(int));
+    pnode->zero_index = (int *) sp_calloc(child->d1, sizeof(int));
+    pnode->prod_nonzero = (double *) sp_calloc(child->d1, sizeof(double));
 
     node->left = child;
     expr_retain(child);
diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c
index 55bedd2..b1f51df 100644
--- a/src/atoms/other/prod_axis_zero.c
+++ b/src/atoms/other/prod_axis_zero.c
@@ -358,7 +358,7 @@ static void free_type_data(expr *node)
 /* TODO: refactor to remove diagonal entry as nonzero since it's always zero */
 expr *new_prod_axis_zero(expr *child)
 {
-    prod_axis *pnode = (prod_axis *) SP_CALLOC(1, sizeof(prod_axis));
+    prod_axis *pnode = (prod_axis *) sp_calloc(1, sizeof(prod_axis));
     expr *node = &pnode->base;
 
     /* output is always a row vector 1 x d2 - TODO: is that correct? */
@@ -367,9 +367,9 @@ expr *new_prod_axis_zero(expr *child)
               free_type_data);
 
     /* allocate arrays to store per-column statistics */
-    pnode->num_of_zeros = (int *) SP_CALLOC(child->d2, sizeof(int));
-    pnode->zero_index = (int *) SP_CALLOC(child->d2, sizeof(int));
-    pnode->prod_nonzero = (double *) SP_CALLOC(child->d2, sizeof(double));
+    pnode->num_of_zeros = (int *) sp_calloc(child->d2, sizeof(int));
+    pnode->zero_index = (int *) sp_calloc(child->d2, sizeof(int));
+    pnode->prod_nonzero = (double *) sp_calloc(child->d2, sizeof(double));
 
     node->left = child;
     expr_retain(child);
diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c
index d49c16b..d02e52c 100644
--- a/src/atoms/other/quad_form.c
+++ b/src/atoms/other/quad_form.c
@@ -18,8 +18,8 @@
 #include "atoms/non_elementwise_full_dom.h"
 #include "subexpr.h"
 #include "utils/CSC_matrix.h"
-#include "utils/matrix_sum.h"
 #include "utils/cblas_wrapper.h"
+#include "utils/matrix_sum.h"
 #include "utils/sparse_matrix.h"
 #include "utils/tracked_alloc.h"
 #include <assert.h>
@@ -107,8 +107,8 @@ static void eval_jacobian(expr *node)
 
         if (!x->work->jacobian_csc_filled)
         {
-            csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), x->work->jacobian_csc,
-                                   x->work->csc_work);
+            csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian),
+                                   x->work->jacobian_csc, x->work->csc_work);
 
             if (x->is_affine(x))
             {
@@ -173,8 +173,7 @@ static void wsum_hess_init_impl(expr *node)
         node->work->hess_term2 = x->wsum_hess->copy_sparsity(x->wsum_hess);
 
         /* hess = term1 + term2 */
-        int max_nnz =
-            node->work->hess_term1->nnz + node->work->hess_term2->nnz;
+        int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz;
         node->wsum_hess =
             new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz);
         sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2,
@@ -201,7 +200,8 @@ static void eval_wsum_hess(expr *node, const double *w)
         CSC_matrix *Jf = x->work->jacobian_csc;
         if (!x->work->jacobian_csc_filled)
         {
-            csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf, x->work->csc_work);
+            csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf,
+                                   x->work->csc_work);
 
             if (x->is_affine(x))
             {
@@ -222,8 +222,10 @@ static void eval_wsum_hess(expr *node, const double *w)
                x->wsum_hess->nnz * sizeof(double));
 
         /* scale both terms by 2w */
-        cblas_dscal(node->work->hess_term1->nnz, two_w, node->work->hess_term1->x, 1);
-        cblas_dscal(node->work->hess_term2->nnz, two_w, node->work->hess_term2->x, 1);
+        cblas_dscal(node->work->hess_term1->nnz, two_w, node->work->hess_term1->x,
+                    1);
+        cblas_dscal(node->work->hess_term2->nnz, two_w, node->work->hess_term2->x,
+                    1);
 
         /* sum the two terms */
         sum_matrices_fill_values(node->work->hess_term1, node->work->hess_term2,
@@ -253,7 +255,7 @@ static bool is_affine(const expr *node)
 expr *new_quad_form(expr *left, CSR_matrix *Q)
 {
     assert(left->d1 == 1 || left->d2 == 1); /* left must be a vector */
-    quad_form_expr *qnode = (quad_form_expr *) SP_CALLOC(1, sizeof(quad_form_expr));
+    quad_form_expr *qnode = (quad_form_expr *) sp_calloc(1, sizeof(quad_form_expr));
     expr *node = &qnode->base;
 
     init_expr(node, 1, 1, left->n_vars, forward, jacobian_init_impl, eval_jacobian,
@@ -266,6 +268,6 @@ expr *new_quad_form(expr *left, CSR_matrix *Q)
     copy_CSR_matrix(Q, qnode->Q);
 
     /* dwork stores the result of Q @ f(x) in the forward pass */
-    node->work->dwork = (double *) SP_MALLOC(left->size * sizeof(double));
+    node->work->dwork = (double *) sp_malloc(left->size * sizeof(double));
     return node;
 }
diff --git a/src/expr.c b/src/expr.c
index b99b266..03212c2 100644
--- a/src/expr.c
+++ b/src/expr.c
@@ -32,7 +32,7 @@ void init_expr(expr *node, int d1, int d2, int n_vars, forward_fn forward,
     node->size = d1 * d2;
     node->n_vars = n_vars;
     node->refcount = 0;
-    node->value = (double *) SP_CALLOC(d1 * d2, sizeof(double));
+    node->value = (double *) sp_calloc(d1 * d2, sizeof(double));
     node->var_id = NOT_A_VARIABLE;
     node->forward = forward;
     node->jacobian_init_impl = jacobian_init;
@@ -41,7 +41,7 @@ void init_expr(expr *node, int d1, int d2, int n_vars, forward_fn forward,
     node->wsum_hess_init_impl = wsum_hess_init;
     node->eval_wsum_hess = eval_wsum_hess;
     node->free_type_data = free_type_data;
-    node->work = (Expr_Work *) SP_CALLOC(1, sizeof(Expr_Work));
+    node->work = (Expr_Work *) sp_calloc(1, sizeof(Expr_Work));
 }
 
 void jacobian_csc_init(expr *node)
@@ -50,9 +50,9 @@ void jacobian_csc_init(expr *node)
     {
         return;
     }
-    node->work->csc_work = (int *) SP_MALLOC(node->n_vars * sizeof(int));
-    node->work->jacobian_csc =
-        csr_to_csc_alloc(node->jacobian->to_csr(node->jacobian), node->work->csc_work);
+    node->work->csc_work = (int *) sp_malloc(node->n_vars * sizeof(int));
+    node->work->jacobian_csc = csr_to_csc_alloc(
+        node->jacobian->to_csr(node->jacobian), node->work->csc_work);
 }
 
 void free_expr(expr *node)
diff --git a/src/old-code/linear_op.c b/src/old-code/linear_op.c
index 8637ee6..b66a8d9 100644
--- a/src/old-code/linear_op.c
+++ b/src/old-code/linear_op.c
@@ -91,7 +91,7 @@ expr *new_linear(expr *u, const CSR_matrix *A, const double *b)
     assert(u->d2 == 1);
     /* Allocate the type-specific struct */
     linear_op_expr *lin_node =
-        (linear_op_expr *) SP_CALLOC(1, sizeof(linear_op_expr));
+        (linear_op_expr *) sp_calloc(1, sizeof(linear_op_expr));
     expr *node = &lin_node->base;
     init_expr(node, A->m, 1, u->n_vars, forward, jacobian_init_impl, eval_jacobian,
               is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data);
@@ -106,7 +106,7 @@ expr *new_linear(expr *u, const CSR_matrix *A, const double *b)
     /* Initialize offset (copy b if provided, otherwise NULL) */
     if (b != NULL)
     {
-        lin_node->b = (double *) SP_MALLOC(A->m * sizeof(double));
+        lin_node->b = (double *) sp_malloc(A->m * sizeof(double));
         memcpy(lin_node->b, b, A->m * sizeof(double));
     }
     else
diff --git a/src/old-code/old_permuted_dense.c b/src/old-code/old_permuted_dense.c
index d32dc07..2f3deb1 100644
--- a/src/old-code/old_permuted_dense.c
+++ b/src/old-code/old_permuted_dense.c
@@ -30,7 +30,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A)
     /* Gather the union of columns appearing in A's rows at positions
        row_perm_B. Use a bitmap of size A->n for O(nnz) collection. */
     int p = A->n;
-    char *seen = (char *) SP_CALLOC(p, sizeof(char));
+    char *seen = (char *) sp_calloc(p, sizeof(char));
     int s_A = 0;
     for (int kk = 0; kk < B->m0; kk++)
     {
@@ -46,7 +46,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A)
         }
     }
 
-    int *col_active = (int *) SP_MALLOC((s_A > 0 ? s_A : 1) * sizeof(int));
+    int *col_active = (int *) sp_malloc((s_A > 0 ? s_A : 1) * sizeof(int));
     int idx = 0;
     for (int j = 0; j < p; j++)
     {
@@ -71,7 +71,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A)
     {
         free(C_pd->kernel_dwork);
         C_pd->kernel_dwork_size = gather_size;
-        C_pd->kernel_dwork = (double *) SP_CALLOC(gather_size, sizeof(double));
+        C_pd->kernel_dwork = (double *) sp_calloc(gather_size, sizeof(double));
     }
     return C;
 }
@@ -163,7 +163,7 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A)
     /* Gather the union of columns appearing in B's rows at positions
        row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. */
     int q = B_csr->n;
-    char *seen = (char *) SP_CALLOC(q, sizeof(char));
+    char *seen = (char *) sp_calloc(q, sizeof(char));
     int r_B = 0;
     for (int kk = 0; kk < A->m0; kk++)
     {
@@ -179,7 +179,7 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A)
         }
     }
 
-    int *row_active = (int *) SP_MALLOC((r_B > 0 ? r_B : 1) * sizeof(int));
+    int *row_active = (int *) sp_malloc((r_B > 0 ? r_B : 1) * sizeof(int));
     int idx = 0;
     for (int i = 0; i < q; i++)
     {
@@ -203,7 +203,7 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A)
     {
         free(C_pd->kernel_dwork);
         C_pd->kernel_dwork_size = gather_size;
-        C_pd->kernel_dwork = (double *) SP_CALLOC(gather_size, sizeof(double));
+        C_pd->kernel_dwork = (double *) sp_calloc(gather_size, sizeof(double));
     }
     return C;
 }
diff --git a/src/problem.c b/src/problem.c
index 4a6bab9..c2ec84a 100644
--- a/src/problem.c
+++ b/src/problem.c
@@ -32,8 +32,11 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork);
 problem *new_problem(expr *objective, expr **constraints, int n_constraints,
                      bool verbose)
 {
-    g_allocated_bytes = 0;
-    problem *prob = (problem *) SP_CALLOC(1, sizeof(problem));
+    /* we don't reset g_peak_bytes or g_allocated_bytes since allocations
+       using sp_malloc/sp_calloc might have happened before new_problem in eg.,
+       left_matmul, and their frees will subtract from this counter. */
+    g_peak_bytes = g_allocated_bytes;
+    problem *prob = (problem *) sp_calloc(1, sizeof(problem));
     if (!prob) return NULL;
 
     /* objective */
@@ -47,7 +50,7 @@ problem *new_problem(expr *objective, expr **constraints, int n_constraints,
     prob->n_constraints = n_constraints;
     if (n_constraints > 0)
     {
-        prob->constraints = (expr **) SP_MALLOC(n_constraints * sizeof(expr *));
+        prob->constraints = (expr **) sp_malloc(n_constraints * sizeof(expr *));
         for (int i = 0; i < n_constraints; i++)
         {
             prob->constraints[i] = constraints[i];
@@ -58,8 +61,8 @@ problem *new_problem(expr *objective, expr **constraints, int n_constraints,
 
     /* allocation */
     prob->constraint_values =
-        (double *) SP_CALLOC(prob->total_constraint_size, sizeof(double));
-    prob->gradient_values = (double *) SP_CALLOC(prob->n_vars, sizeof(double));
+        (double *) sp_calloc(prob->total_constraint_size, sizeof(double));
+    prob->gradient_values = (double *) sp_calloc(prob->n_vars, sizeof(double));
 
     /* Initialize statistics */
     prob->stats.time_init_derivatives = 0.0;
@@ -251,8 +254,8 @@ void problem_init_hessian(problem *prob)
     prob->lagrange_hessian = new_CSR_matrix(prob->n_vars, prob->n_vars, nnz);
     memset(prob->lagrange_hessian->x, 0, nnz * sizeof(double)); /* affine shortcut */
     prob->stats.nnz_hessian = nnz;
-    prob->hess_idx_map = (int *) SP_MALLOC(nnz * sizeof(int));
-    int *iwork = (int *) SP_MALLOC(MAX(nnz, prob->n_vars) * sizeof(int));
+    prob->hess_idx_map = (int *) sp_malloc(nnz * sizeof(int));
+    int *iwork = (int *) sp_malloc(MAX(nnz, prob->n_vars) * sizeof(int));
     problem_lagrange_hess_fill_sparsity(prob, iwork);
     free(iwork);
 
@@ -321,7 +324,7 @@ static inline void print_end_message(const Diff_engine_stats *stats)
     printf("  Lagrange Hessian (nnz):                 %d\n", stats->nnz_hessian);
     char mem_buf[64];
     format_memory(stats->memory_bytes, mem_buf, sizeof(mem_buf));
-    printf("  Allocated memory:                       %s\n", mem_buf);
+    printf("  Peak memory:                            %s\n", mem_buf);
 
     printf("\nTiming (seconds):\n");
     printf("  Derivative structure (sparsity):     %8.3f\n",
@@ -349,7 +352,7 @@ void free_problem(problem *prob)
 
     if (prob->verbose)
     {
-        prob->stats.memory_bytes = g_allocated_bytes;
+        prob->stats.memory_bytes = g_peak_bytes;
         print_end_message(&prob->stats);
     }
 
@@ -380,7 +383,7 @@ void free_problem(problem *prob)
 void problem_register_params(problem *prob, expr **param_nodes, int n_param_nodes)
 {
     prob->n_param_nodes = n_param_nodes;
-    prob->param_nodes = (expr **) SP_MALLOC(n_param_nodes * sizeof(expr *));
+    prob->param_nodes = (expr **) sp_malloc(n_param_nodes * sizeof(expr *));
     memcpy(prob->param_nodes, param_nodes, n_param_nodes * sizeof(expr *));
 
     prob->total_parameter_size = 0;
diff --git a/src/utils/COO_matrix.c b/src/utils/COO_matrix.c
index 8602c09..0cfe2c4 100644
--- a/src/utils/COO_matrix.c
+++ b/src/utils/COO_matrix.c
@@ -22,13 +22,13 @@
 
 COO_matrix *new_COO_matrix(const CSR_matrix *A)
 {
-    COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix));
+    COO_matrix *coo = (COO_matrix *) sp_malloc(sizeof(COO_matrix));
     coo->m = A->m;
     coo->n = A->n;
     coo->nnz = A->nnz;
-    coo->rows = (int *) SP_MALLOC(A->nnz * sizeof(int));
-    coo->cols = (int *) SP_MALLOC(A->nnz * sizeof(int));
-    coo->x = (double *) SP_MALLOC(A->nnz * sizeof(double));
+    coo->rows = (int *) sp_malloc(A->nnz * sizeof(int));
+    coo->cols = (int *) sp_malloc(A->nnz * sizeof(int));
+    coo->x = (double *) sp_malloc(A->nnz * sizeof(double));
     coo->value_map = NULL;
 
     for (int r = 0; r < A->m; r++)
@@ -60,14 +60,14 @@ COO_matrix *new_COO_matrix_lower_triangular(const CSR_matrix *A)
         }
     }
 
-    COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix));
+    COO_matrix *coo = (COO_matrix *) sp_malloc(sizeof(COO_matrix));
     coo->m = A->m;
     coo->n = A->n;
     coo->nnz = count;
-    coo->rows = (int *) SP_MALLOC(count * sizeof(int));
-    coo->cols = (int *) SP_MALLOC(count * sizeof(int));
-    coo->x = (double *) SP_MALLOC(count * sizeof(double));
-    coo->value_map = (int *) SP_MALLOC(count * sizeof(int));
+    coo->rows = (int *) sp_malloc(count * sizeof(int));
+    coo->cols = (int *) sp_malloc(count * sizeof(int));
+    coo->x = (double *) sp_malloc(count * sizeof(double));
+    coo->value_map = (int *) sp_malloc(count * sizeof(int));
 
     /* Pass 2: fill arrays */
     int idx = 0;
diff --git a/src/utils/CSC_matrix.c b/src/utils/CSC_matrix.c
index 08eec98..5fc5aa1 100644
--- a/src/utils/CSC_matrix.c
+++ b/src/utils/CSC_matrix.c
@@ -24,12 +24,12 @@
 
 CSC_matrix *new_CSC_matrix(int m, int n, int nnz)
 {
-    CSC_matrix *matrix = (CSC_matrix *) SP_MALLOC(sizeof(CSC_matrix));
+    CSC_matrix *matrix = (CSC_matrix *) sp_malloc(sizeof(CSC_matrix));
     if (!matrix) return NULL;
 
-    matrix->p = (int *) SP_MALLOC((n + 1) * sizeof(int));
-    matrix->i = (int *) SP_MALLOC(nnz * sizeof(int));
-    matrix->x = (double *) SP_MALLOC(nnz * sizeof(double));
+    matrix->p = (int *) sp_malloc((n + 1) * sizeof(int));
+    matrix->i = (int *) sp_malloc(nnz * sizeof(int));
+    matrix->x = (double *) sp_malloc(nnz * sizeof(double));
 
     if (!matrix->p || !matrix->i || !matrix->x)
     {
@@ -67,7 +67,7 @@ CSR_matrix *ATA_alloc(const CSC_matrix *A)
     int i, j, ii, jj;
 
     /* row ptr and column idxs for upper triangular part of C = A^T A */
-    int *Cp = (int *) SP_MALLOC((n + 1) * sizeof(int));
+    int *Cp = (int *) sp_malloc((n + 1) * sizeof(int));
     iVec *Ci = iVec_new(m);
     Cp[0] = 0;
 
@@ -341,7 +341,7 @@ CSR_matrix *BTA_alloc(const CSC_matrix *A, const CSC_matrix *B)
     int i, j, ii, jj;
 
     /* row ptr and column idxs for C = B^T A */
-    int *Cp = (int *) SP_MALLOC((p + 1) * sizeof(int));
+    int *Cp = (int *) sp_malloc((p + 1) * sizeof(int));
     iVec *Ci = iVec_new(n);
     Cp[0] = 0;
 
@@ -487,13 +487,13 @@ CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A)
     int i, j, k, jj, ii, ell;
 
     /* marker[row] = last column j that registered row as nonzero */
-    int *marker = (int *) SP_MALLOC(m * sizeof(int));
+    int *marker = (int *) sp_malloc(m * sizeof(int));
     for (i = 0; i < m; i++)
     {
         marker[i] = -1;
     }
 
-    int *Cp = (int *) SP_MALLOC((n + 1) * sizeof(int));
+    int *Cp = (int *) sp_malloc((n + 1) * sizeof(int));
     iVec *Ci = iVec_new(A->nnz);
     Cp[0] = 0;
 
diff --git a/src/utils/CSR_matrix.c b/src/utils/CSR_matrix.c
index 1123068..132e58c 100644
--- a/src/utils/CSR_matrix.c
+++ b/src/utils/CSR_matrix.c
@@ -27,10 +27,10 @@
 
 CSR_matrix *new_CSR_matrix(int m, int n, int nnz)
 {
-    CSR_matrix *matrix = (CSR_matrix *) SP_MALLOC(sizeof(CSR_matrix));
-    matrix->p = (int *) SP_CALLOC(m + 1, sizeof(int));
-    matrix->i = (int *) SP_CALLOC(nnz, sizeof(int));
-    matrix->x = (double *) SP_MALLOC(nnz * sizeof(double));
+    CSR_matrix *matrix = (CSR_matrix *) sp_malloc(sizeof(CSR_matrix));
+    matrix->p = (int *) sp_calloc(m + 1, sizeof(int));
+    matrix->i = (int *) sp_calloc(nnz, sizeof(int));
+    matrix->x = (double *) sp_malloc(nnz * sizeof(double));
     matrix->m = m;
     matrix->n = n;
     matrix->nnz = nnz;
@@ -258,7 +258,7 @@ void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C)
     int i, j, col;
 
     /* Count entries per row */
-    int *counts = (int *) SP_CALLOC(m, sizeof(int));
+    int *counts = (int *) sp_calloc(m, sizeof(int));
     for (i = 0; i < m; i++)
     {
         for (j = Ap[i]; j < Ap[i + 1]; j++)
diff --git a/src/utils/CSR_sum.c b/src/utils/CSR_sum.c
index 286fd38..b30ef8d 100644
--- a/src/utils/CSR_sum.c
+++ b/src/utils/CSR_sum.c
@@ -373,7 +373,7 @@ CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B,
     CSR_matrix *out = new_CSR_matrix(m, n, nnz_ub);
     for (int k = 0; k < 4; k++)
     {
-        idx_maps[k] = (int *) SP_MALLOC(inputs[k]->nnz * sizeof(int));
+        idx_maps[k] = (int *) sp_malloc(inputs[k]->nnz * sizeof(int));
     }
 
     /* 4-way sorted merge per row */
diff --git a/src/utils/int_double_pair.c b/src/utils/int_double_pair.c
index d03096a..8fe149d 100644
--- a/src/utils/int_double_pair.c
+++ b/src/utils/int_double_pair.c
@@ -31,7 +31,7 @@ static int compare_int_double_pair(const void *a, const void *b)
 
 int_double_pair *new_int_double_pair_array(int size)
 {
-    return (int_double_pair *) SP_MALLOC(size * sizeof(int_double_pair));
+    return (int_double_pair *) sp_malloc(size * sizeof(int_double_pair));
 }
 
 void set_int_double_pair_array(int_double_pair *pair, int *ints, double *doubles,
diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c
index 22f133d..6a66841 100644
--- a/src/utils/linalg_dense_sparse_matmuls.c
+++ b/src/utils/linalg_dense_sparse_matmuls.c
@@ -15,11 +15,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "utils/linalg_dense_sparse_matmuls.h"
 #include "utils/CSC_matrix.h"
 #include "utils/CSR_matrix.h"
 #include "utils/cblas_wrapper.h"
 #include "utils/iVec.h"
-#include "utils/linalg_dense_sparse_matmuls.h"
 #include "utils/tracked_alloc.h"
 #include <assert.h>
 #include <stdlib.h>
@@ -35,7 +35,7 @@ CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p)
     int n = A->n;
     int i, j, jj, block, block_start, block_end, block_jj_start, row_offset;
 
-    int *Cp = (int *) SP_MALLOC((J->n + 1) * sizeof(int));
+    int *Cp = (int *) sp_malloc((J->n + 1) * sizeof(int));
     iVec *Ci = iVec_new(J->n * m);
     Cp[0] = 0;
 
@@ -147,8 +147,8 @@ void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C,
                     j_dense[J->i[s] - block_start] = J->x[s];
                 }
 
-                cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A->x, n,
-                            j_dense, 1, 0.0, C->x + i, 1);
+                cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A->x, n, j_dense,
+                            1, 0.0, C->x + i, 1);
             }
         }
     }
@@ -174,7 +174,7 @@ CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J)
     // ---------------------------------------------------------------
     //           build sparsity pattern per blk_row
     // ---------------------------------------------------------------
-    iVec **pattern = (iVec **) SP_MALLOC(m * sizeof(iVec *));
+    iVec **pattern = (iVec **) sp_malloc(m * sizeof(iVec *));
     total_nnz = 0;
     for (blk_row = 0; blk_row < m; blk_row++)
     {
@@ -262,7 +262,7 @@ CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J)
      *         nonzero in row range [blk*k, blk*k + k). */
     int i, j, ii, blk;
 
-    iVec **pattern = (iVec **) SP_MALLOC(n * sizeof(iVec *));
+    iVec **pattern = (iVec **) sp_malloc(n * sizeof(iVec *));
     int total_nnz = 0;
     for (blk = 0; blk < n; blk++)
     {
diff --git a/src/utils/linalg_sparse_matmuls.c b/src/utils/linalg_sparse_matmuls.c
index 7720f70..9ca83c5 100644
--- a/src/utils/linalg_sparse_matmuls.c
+++ b/src/utils/linalg_sparse_matmuls.c
@@ -96,7 +96,7 @@ CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A,
         row_offset;
 
     /* allocate column pointers and an estimate of row indices */
-    int *Cp = (int *) SP_MALLOC((J->n + 1) * sizeof(int));
+    int *Cp = (int *) sp_malloc((J->n + 1) * sizeof(int));
     iVec *Ci = iVec_new(J->n * m);
     Cp[0] = 0;
 
@@ -250,8 +250,8 @@ void csr_csc_matmul_fill_values(const CSR_matrix *A, const CSC_matrix *B,
     }
 }
 
-/* C = A @ B where A is CSR_matrix (m x n), B is CSC_matrix (n x p). Result C is CSR_matrix (m x p)
-  with precomputed sparsity pattern */
+/* C = A @ B where A is CSR_matrix (m x n), B is CSC_matrix (n x p). Result C is
+  CSR_matrix (m x p) with precomputed sparsity pattern */
 CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B)
 {
     int m = A->m;
@@ -259,7 +259,7 @@ CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B)
 
     int len_a, len_b;
 
-    int *Cp = (int *) SP_MALLOC((m + 1) * sizeof(int));
+    int *Cp = (int *) sp_malloc((m + 1) * sizeof(int));
     iVec *Ci = iVec_new(m);
 
     Cp[0] = 0;
diff --git a/src/utils/matrix_sum.c b/src/utils/matrix_sum.c
index c8b12b5..3bfc65a 100644
--- a/src/utils/matrix_sum.c
+++ b/src/utils/matrix_sum.c
@@ -33,6 +33,6 @@ void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C)
 void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C,
                                      const double *d1, const double *d2)
 {
-    sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C),
-                                        d1, d2);
+    sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C), d1,
+                                        d2);
 }
diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c
index def3d19..eb058b9 100644
--- a/src/utils/permuted_dense.c
+++ b/src/utils/permuted_dense.c
@@ -111,7 +111,7 @@ static void permuted_dense_vtable_transpose_fill_values(const matrix *self,
 matrix *index_pd_alloc(const permuted_dense *A, const int *indices, int n_idxs)
 {
     /* Scan indices: which output positions i hit a row in A->row_perm? */
-    int *new_row_perm = (int *) SP_MALLOC(n_idxs * sizeof(int));
+    int *new_row_perm = (int *) sp_malloc(n_idxs * sizeof(int));
     int new_m0 = 0;
     for (int i = 0; i < n_idxs; i++)
     {
@@ -164,7 +164,7 @@ matrix *promote_pd_alloc(const permuted_dense *A, int size)
                                   NULL);
     }
 
-    int *new_row_perm = (int *) SP_MALLOC(size * sizeof(int));
+    int *new_row_perm = (int *) sp_malloc(size * sizeof(int));
     for (int i = 0; i < size; i++)
     {
         new_row_perm[i] = i;
@@ -220,7 +220,7 @@ matrix *broadcast_pd_alloc(const permuted_dense *A, broadcast_type type, int d1,
                                   NULL);
     }
 
-    int *new_row_perm = (int *) SP_MALLOC(new_m0 * sizeof(int));
+    int *new_row_perm = (int *) sp_malloc(new_m0 * sizeof(int));
     int k = 0;
     if (type == BROADCAST_SCALAR)
     {
@@ -319,7 +319,7 @@ matrix *diag_vec_pd_alloc(const permuted_dense *A)
                                   NULL);
     }
 
-    int *new_row_perm = (int *) SP_MALLOC(A->m0 * sizeof(int));
+    int *new_row_perm = (int *) sp_malloc(A->m0 * sizeof(int));
     for (int ii = 0; ii < A->m0; ii++)
     {
         new_row_perm[ii] = A->row_perm[ii] * (n + 1);
@@ -444,7 +444,7 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm,
         assert(col_perm[0] >= 0 && col_perm[n0 - 1] < n);
     }
 
-    permuted_dense *pd = (permuted_dense *) SP_CALLOC(1, sizeof(permuted_dense));
+    permuted_dense *pd = (permuted_dense *) sp_calloc(1, sizeof(permuted_dense));
     pd->base.m = m;
     pd->base.n = n;
     pd->base.nnz = m0 * n0;
@@ -454,13 +454,13 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm,
     pd->n0 = n0;
 
     int sz = m0 * n0;
-    pd->row_perm = (int *) SP_MALLOC(m0 * sizeof(int));
-    pd->col_perm = (int *) SP_MALLOC(n0 * sizeof(int));
-    pd->X = (double *) SP_MALLOC(sz * sizeof(double));
+    pd->row_perm = (int *) sp_malloc(m0 * sizeof(int));
+    pd->col_perm = (int *) sp_malloc(n0 * sizeof(int));
+    pd->X = (double *) sp_malloc(sz * sizeof(double));
     pd->base.x = pd->X;
     pd->owns_X = true;
-    pd->col_inv = (int *) SP_MALLOC(n * sizeof(int));
-    pd->row_inv = (int *) SP_MALLOC(m * sizeof(int));
+    pd->col_inv = (int *) sp_malloc(n * sizeof(int));
+    pd->row_inv = (int *) sp_malloc(m * sizeof(int));
 
     if (m0 > 0)
     {
@@ -499,8 +499,8 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm,
 
 matrix *new_permuted_dense_full(int m, int n, const double *data)
 {
-    int *row_perm = (int *) SP_MALLOC(m * sizeof(int));
-    int *col_perm = (int *) SP_MALLOC(n * sizeof(int));
+    int *row_perm = (int *) sp_malloc(m * sizeof(int));
+    int *col_perm = (int *) sp_malloc(n * sizeof(int));
     for (int i = 0; i < m; i++) row_perm[i] = i;
     for (int j = 0; j < n; j++) col_perm[j] = j;
     matrix *out = new_permuted_dense(m, n, m, n, row_perm, col_perm, data);
@@ -548,6 +548,6 @@ void permuted_dense_ensure_kernel_dwork(const permuted_dense *pd_const, size_t s
     permuted_dense *pd = (permuted_dense *) pd_const;
     if (pd->kernel_dwork_size >= size) return;
     free(pd->kernel_dwork);
-    pd->kernel_dwork = (double *) SP_MALLOC(size * sizeof(double));
+    pd->kernel_dwork = (double *) sp_malloc(size * sizeof(double));
     pd->kernel_dwork_size = size;
 }
diff --git a/src/utils/permuted_dense_linalg.c b/src/utils/permuted_dense_linalg.c
index ca89f61..dc493e7 100644
--- a/src/utils/permuted_dense_linalg.c
+++ b/src/utils/permuted_dense_linalg.c
@@ -127,7 +127,7 @@ matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A)
        in iwork, hence 2 * s_max). */
     permuted_dense *C_pd = (permuted_dense *) C;
     C_pd->kernel_iwork_size = (size_t) 2 * s_max;
-    C_pd->kernel_iwork = (int *) SP_MALLOC(C_pd->kernel_iwork_size * sizeof(int));
+    C_pd->kernel_iwork = (int *) sp_malloc(C_pd->kernel_iwork_size * sizeof(int));
 
     return C;
 }
@@ -328,7 +328,7 @@ matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A)
        same idiom as BTA_pd_pd_alloc. */
     permuted_dense *C_pd = (permuted_dense *) C;
     C_pd->kernel_iwork_size = (size_t) 2 * s_max;
-    C_pd->kernel_iwork = (int *) SP_MALLOC(C_pd->kernel_iwork_size * sizeof(int));
+    C_pd->kernel_iwork = (int *) sp_malloc(C_pd->kernel_iwork_size * sizeof(int));
 
     return C;
 }
diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c
index 211ac31..91b2467 100644
--- a/src/utils/sparse_matrix.c
+++ b/src/utils/sparse_matrix.c
@@ -65,7 +65,7 @@ matrix *new_sparse_matrix(CSR_matrix *A);
 void sparse_matrix_ensure_csc_cache(sparse_matrix *sm)
 {
     if (sm->csc_cache != NULL) return;
-    sm->csc_iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int));
+    sm->csc_iwork = (int *) sp_malloc(sm->csr->n * sizeof(int));
     sm->csc_cache = csr_to_csc_alloc(sm->csr, sm->csc_iwork);
 }
 
@@ -105,7 +105,7 @@ static CSR_matrix *sparse_to_csr(matrix *self)
 static matrix *sparse_transpose_alloc(const matrix *self)
 {
     const sparse_matrix *sm = (const sparse_matrix *) self;
-    int *iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int));
+    int *iwork = (int *) sp_malloc(sm->csr->n * sizeof(int));
     CSR_matrix *AT = AT_alloc(sm->csr, iwork);
     sparse_matrix *out = (sparse_matrix *) new_sparse_matrix(AT);
     out->transpose_iwork = iwork;
@@ -301,7 +301,8 @@ static void sparse_diag_vec_fill_values(matrix *self, matrix *out)
     }
 }
 
-/* Build CSC_matrix structure on first call; refill values from csr->x on every call. */
+/* Build CSC_matrix structure on first call; refill values from csr->x on every call.
+ */
 static void sparse_refresh_csc_values(matrix *self)
 {
     sparse_matrix *sm = (sparse_matrix *) self;
@@ -335,7 +336,7 @@ static void wire_vtable(sparse_matrix *sm)
 
 matrix *new_sparse_matrix(CSR_matrix *A)
 {
-    sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix));
+    sparse_matrix *sm = (sparse_matrix *) sp_calloc(1, sizeof(sparse_matrix));
     sm->base.m = A->m;
     sm->base.n = A->n;
     sm->base.nnz = A->nnz;
@@ -353,7 +354,7 @@ matrix *new_sparse_matrix_alloc(int m, int n, int nnz)
 matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork)
 {
     CSR_matrix *AT = transpose(self->csr, iwork);
-    sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix));
+    sparse_matrix *sm = (sparse_matrix *) sp_calloc(1, sizeof(sparse_matrix));
     sm->base.m = AT->m;
     sm->base.n = AT->n;
     sm->base.nnz = AT->nnz;
diff --git a/src/utils/stacked_pd.c b/src/utils/stacked_pd.c
index aacaaf0..83d87ab 100644
--- a/src/utils/stacked_pd.c
+++ b/src/utils/stacked_pd.c
@@ -157,8 +157,8 @@ matrix *spd_map_filter_blocks(const stacked_pd *B, int Cm, int Cn, spd_block_op
     }
 
     permuted_dense **C_blocks =
-        (permuted_dense **) SP_MALLOC(B->n_blocks * sizeof(permuted_dense *));
-    int *C_src = (int *) SP_MALLOC(B->n_blocks * sizeof(int));
+        (permuted_dense **) sp_malloc(B->n_blocks * sizeof(permuted_dense *));
+    int *C_src = (int *) sp_malloc(B->n_blocks * sizeof(int));
 
     int out_nb = 0;
     for (int k = 0; k < B->n_blocks; k++)
@@ -176,7 +176,7 @@ matrix *spd_map_filter_blocks(const stacked_pd *B, int Cm, int Cn, spd_block_op
         }
     }
 
-    int *C_src_p = (int *) SP_MALLOC((out_nb + 1) * sizeof(int));
+    int *C_src_p = (int *) sp_malloc((out_nb + 1) * sizeof(int));
     for (int k = 0; k <= out_nb; k++)
     {
         C_src_p[k] = k;
@@ -198,7 +198,7 @@ void compose_csr_idx_map_for_spd(const stacked_pd *spd, const CSR_matrix *csr,
        col_perm straight into csr->i, the n0 CSR entries for any row are
        exactly that block's columns in order — CSR position is just
        csr->p[row] + jj. */
-    int *scratch = (int *) SP_MALLOC(spd->base.nnz * sizeof(int));
+    int *scratch = (int *) sp_malloc(spd->base.nnz * sizeof(int));
     int native = 0;
     for (int k = 0; k < spd->n_blocks; k++)
     {
@@ -283,7 +283,7 @@ static matrix *stacked_pd_vtable_diag_vec_alloc(matrix *self)
 {
     stacked_pd *A = (stacked_pd *) self;
     permuted_dense **tmp_blocks =
-        (permuted_dense **) SP_MALLOC(A->n_blocks * sizeof(permuted_dense *));
+        (permuted_dense **) sp_malloc(A->n_blocks * sizeof(permuted_dense *));
     for (int k = 0; k < A->n_blocks; k++)
     {
         tmp_blocks[k] = (permuted_dense *) diag_vec_pd_alloc(A->blocks[k]);
@@ -409,7 +409,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl
     // --------------------------------------------------------------------------------
     //                          Set up basic fields
     // --------------------------------------------------------------------------------
-    stacked_pd *spd = (stacked_pd *) SP_CALLOC(1, sizeof(stacked_pd));
+    stacked_pd *spd = (stacked_pd *) sp_calloc(1, sizeof(stacked_pd));
     spd->base.m = m;
     spd->base.n = n;
     int nnz = 0;
@@ -421,8 +421,8 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl
     wire_vtable(spd);
 
     spd->n_blocks = n_blocks;
-    spd->blocks = (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *));
-    spd->src_block_idx_p = (int *) SP_MALLOC((n_blocks + 1) * sizeof(int));
+    spd->blocks = (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *));
+    spd->src_block_idx_p = (int *) sp_malloc((n_blocks + 1) * sizeof(int));
     spd->src_block_idx_p[0] = 0;
     if (n_blocks > 0)
     {
@@ -435,7 +435,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl
     if (src_block_idx_p == NULL)
     {
         /* identity: each output block has itself as its one source */
-        spd->src_block_idx = (int *) SP_MALLOC(n_blocks * sizeof(int));
+        spd->src_block_idx = (int *) sp_malloc(n_blocks * sizeof(int));
         for (int k = 0; k < n_blocks; k++)
         {
             spd->src_block_idx_p[k + 1] = k + 1;
@@ -446,7 +446,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl
     {
         int total = src_block_idx_p[n_blocks];
         memcpy(spd->src_block_idx_p, src_block_idx_p, (n_blocks + 1) * sizeof(int));
-        spd->src_block_idx = (int *) SP_MALLOC(total * sizeof(int));
+        spd->src_block_idx = (int *) sp_malloc(total * sizeof(int));
         if (total > 0)
         {
             memcpy(spd->src_block_idx, src_block_idx, total * sizeof(int));
@@ -456,7 +456,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl
     // ---------------------------------------------------------------------------
     // Absorb each block's X into a single shared values buffer owned by this spd.
     // ----------------------------------------------------------------------------
-    spd->base.x = (double *) SP_MALLOC(spd->base.nnz * sizeof(double));
+    spd->base.x = (double *) sp_malloc(spd->base.nnz * sizeof(double));
     int offset = 0;
     for (int k = 0; k < n_blocks; k++)
     {
diff --git a/src/utils/stacked_pd_coalesce.c b/src/utils/stacked_pd_coalesce.c
index f123a45..99c5d31 100644
--- a/src/utils/stacked_pd_coalesce.c
+++ b/src/utils/stacked_pd_coalesce.c
@@ -39,7 +39,7 @@ typedef struct
 
 static int_csr *int_csr_create(int *p, int *data, int n)
 {
-    int_csr *csr = (int_csr *) SP_MALLOC(sizeof(int_csr));
+    int_csr *csr = (int_csr *) sp_malloc(sizeof(int_csr));
     csr->p = p;
     csr->data = data;
     csr->n = n;
@@ -118,7 +118,7 @@ static int_csr *build_row_to_blocks_csr(const stacked_pd *A, int *scratch)
     int m = A->base.m;
     int n_blocks = A->n_blocks;
 
-    int *p = (int *) SP_CALLOC(m + 1, sizeof(int));
+    int *p = (int *) sp_calloc(m + 1, sizeof(int));
     for (int k = 0; k < n_blocks; k++)
     {
         permuted_dense *Ak = A->blocks[k];
@@ -129,7 +129,7 @@ static int_csr *build_row_to_blocks_csr(const stacked_pd *A, int *scratch)
     }
     cumsum(p, m);
 
-    int *data = (int *) SP_MALLOC(p[m] * sizeof(int));
+    int *data = (int *) sp_malloc(p[m] * sizeof(int));
     int *blocks_added_per_row = scratch;
     for (int k = 0; k < n_blocks; k++)
     {
@@ -148,7 +148,7 @@ static void catalog_signatures(const int_csr *row_to_blocks, iVec *catalog_sig_p
                                iVec *catalog_sig_data, int **row_to_sig_out)
 {
     int m = row_to_blocks->n;
-    int *row_to_sig = (int *) SP_MALLOC(m * sizeof(int));
+    int *row_to_sig = (int *) sp_malloc(m * sizeof(int));
     for (int i = 0; i < m; i++)
     {
         int signature_len;
@@ -169,7 +169,7 @@ static void catalog_signatures(const int_csr *row_to_blocks, iVec *catalog_sig_p
 static int_csr *group_rows_by_signature(int m, int n_sigs, const int *row_to_sig,
                                         int *scratch)
 {
-    int *p = (int *) SP_CALLOC(n_sigs + 1, sizeof(int));
+    int *p = (int *) sp_calloc(n_sigs + 1, sizeof(int));
     for (int i = 0; i < m; i++)
     {
         if (row_to_sig[i] >= 0)
@@ -179,7 +179,7 @@ static int_csr *group_rows_by_signature(int m, int n_sigs, const int *row_to_sig
     }
     cumsum(p, n_sigs);
 
-    int *data = (int *) SP_MALLOC(p[n_sigs] * sizeof(int));
+    int *data = (int *) sp_malloc(p[n_sigs] * sizeof(int));
     int *rows_added_per_sig = scratch;
     for (int i = 0; i < m; i++)
     {
@@ -208,13 +208,13 @@ static permuted_dense **build_output_blocks(const stacked_pd *A,
     int n = A->base.n;
     int n_sigs = catalog_sig_p->len - 1;
 
-    permuted_dense **out_blocks = (permuted_dense **) SP_MALLOC(
+    permuted_dense **out_blocks = (permuted_dense **) sp_malloc(
         (n_sigs > 0 ? n_sigs : 1) * sizeof(permuted_dense *));
     iVec *col_union = iVec_new(8);
     /* col_arrs / col_lens: scratch passed to sorted_union_int_arrays.
        Max sig size is bounded by A->n_blocks, so size once outside the loop. */
-    const int **col_arrs = (const int **) SP_MALLOC(A->n_blocks * sizeof(int *));
-    int *col_lens = (int *) SP_MALLOC(A->n_blocks * sizeof(int));
+    const int **col_arrs = (const int **) sp_malloc(A->n_blocks * sizeof(int *));
+    int *col_lens = (int *) sp_malloc(A->n_blocks * sizeof(int));
 
     for (int s = 0; s < n_sigs; s++)
     {
@@ -247,7 +247,7 @@ static matrix *coalesce_spd_alloc_impl(const stacked_pd *A,
 {
     int m = A->base.m;
     int n = A->base.n;
-    int *iwork = (int *) SP_CALLOC(m, sizeof(int));
+    int *iwork = (int *) sp_calloc(m, sizeof(int));
 
     // ---------------------------------------------------------------------------
     // For each global row in A, catalog which blocks contain it.
diff --git a/src/utils/stacked_pd_kron_linalg.c b/src/utils/stacked_pd_kron_linalg.c
index cd8b2de..679b7ce 100644
--- a/src/utils/stacked_pd_kron_linalg.c
+++ b/src/utils/stacked_pd_kron_linalg.c
@@ -56,8 +56,8 @@ static permuted_dense *kron_scratch_init(const permuted_dense *A, int p)
     int n = A->n0;
 
     /* Initial block-0 perms. */
-    int *row_perm = (int *) SP_MALLOC(m * sizeof(int));
-    int *col_perm = (int *) SP_MALLOC(n * sizeof(int));
+    int *row_perm = (int *) sp_malloc(m * sizeof(int));
+    int *col_perm = (int *) sp_malloc(n * sizeof(int));
     for (int i = 0; i < m; i++) row_perm[i] = i;
     for (int j = 0; j < n; j++) col_perm[j] = j;
 
@@ -82,8 +82,8 @@ static permuted_dense *kron_BT_alloc(const permuted_dense *A, int p)
 {
     int n = A->n0;
     int m = A->m0;
-    int *row_perm = (int *) SP_MALLOC(n * sizeof(int));
-    int *col_perm = (int *) SP_MALLOC(m * sizeof(int));
+    int *row_perm = (int *) sp_malloc(n * sizeof(int));
+    int *col_perm = (int *) sp_malloc(m * sizeof(int));
     for (int j = 0; j < n; j++) row_perm[j] = j;
     for (int i = 0; i < m; i++) col_perm[i] = i;
     matrix *BT_m = new_permuted_dense(n * p, m * p, n, m, row_perm, col_perm, NULL);
@@ -136,8 +136,8 @@ static matrix *kron_alloc_blockwise(permuted_dense *state, int p, int Cm, int Cn
     }
 
     permuted_dense **C_blocks =
-        (permuted_dense **) SP_MALLOC(p * sizeof(permuted_dense *));
-    int *C_src = (int *) SP_MALLOC(p * sizeof(int));
+        (permuted_dense **) sp_malloc(p * sizeof(permuted_dense *));
+    int *C_src = (int *) sp_malloc(p * sizeof(int));
 
     int out_nb = 0;
     int last_k = 0;
@@ -159,7 +159,7 @@ static matrix *kron_alloc_blockwise(permuted_dense *state, int p, int Cm, int Cn
         }
     }
 
-    int *C_src_p = (int *) SP_MALLOC((out_nb + 1) * sizeof(int));
+    int *C_src_p = (int *) sp_malloc((out_nb + 1) * sizeof(int));
     for (int k = 0; k <= out_nb; k++) C_src_p[k] = k;
 
     matrix *C = new_stacked_pd(Cm, Cn, out_nb, C_blocks, C_src_p, C_src);
diff --git a/src/utils/stacked_pd_linalg.c b/src/utils/stacked_pd_linalg.c
index 6d1f9ae..8df8bc7 100644
--- a/src/utils/stacked_pd_linalg.c
+++ b/src/utils/stacked_pd_linalg.c
@@ -35,7 +35,7 @@ matrix *copy_sparsity_spd_alloc(const stacked_pd *A)
 {
     int n_blocks = A->n_blocks;
     permuted_dense **C_blocks =
-        (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *));
+        (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *));
     for (int k = 0; k < n_blocks; k++)
     {
         C_blocks[k] = (permuted_dense *) copy_sparsity_pd_alloc(A->blocks[k]);
@@ -67,7 +67,7 @@ matrix *transpose_spd_alloc(const stacked_pd *A)
 {
     int n_blocks = A->n_blocks;
     permuted_dense **AT_blocks =
-        (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *));
+        (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *));
     for (int k = 0; k < n_blocks; k++)
     {
         AT_blocks[k] = (permuted_dense *) transpose_pd_alloc(A->blocks[k]);
@@ -126,7 +126,7 @@ static matrix *spd_blockwise_alloc_coalesce(const stacked_pd *spd_iter, int Cm,
 {
     int n_blocks = spd_iter->n_blocks;
     permuted_dense **partials =
-        (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *));
+        (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *));
     for (int k = 0; k < n_blocks; k++)
     {
         partials[k] = (permuted_dense *) op(spd_iter->blocks[k], ctx);
@@ -217,8 +217,8 @@ matrix *BTA_pd_spd_alloc(const permuted_dense *B, const stacked_pd *A)
     // Find column permutations of C. An A-block contributes if its row_perm overlaps
     // B's row_perm.
     // ------------------------------------------------------------------------------
-    const int **col_perms = (const int **) SP_MALLOC(n_blocks * sizeof(int *));
-    int *lens = (int *) SP_MALLOC(n_blocks * sizeof(int));
+    const int **col_perms = (const int **) sp_malloc(n_blocks * sizeof(int *));
+    int *lens = (int *) sp_malloc(n_blocks * sizeof(int));
 
     int n_contributing_A_blocks = 0;
     for (int k = 0; k < n_blocks; k++)
@@ -266,7 +266,7 @@ matrix *BTA_pd_spd_alloc(const permuted_dense *B, const stacked_pd *A)
     }
 
     C_pd->kernel_iwork_size = (size_t) 2 + (size_t) 2 * s_max + (size_t) max_n0_A;
-    C_pd->kernel_iwork = (int *) SP_MALLOC(C_pd->kernel_iwork_size * sizeof(int));
+    C_pd->kernel_iwork = (int *) sp_malloc(C_pd->kernel_iwork_size * sizeof(int));
     C_pd->kernel_iwork[0] = s_max;
     C_pd->kernel_iwork[1] = max_n0_A;
 
diff --git a/src/utils/tracked_alloc.c b/src/utils/tracked_alloc.c
index cb834be..9b0f4d8 100644
--- a/src/utils/tracked_alloc.c
+++ b/src/utils/tracked_alloc.c
@@ -18,3 +18,4 @@
 #include "utils/tracked_alloc.h"
 
 size_t g_allocated_bytes = 0;
+size_t g_peak_bytes = 0;
diff --git a/src/utils/utils.c b/src/utils/utils.c
index 213b4e2..6eb105d 100644
--- a/src/utils/utils.c
+++ b/src/utils/utils.c
@@ -86,7 +86,7 @@ void sorted_union_int_arrays(const int *const *arrs, const int *lens, int n_arrs
                              iVec *out)
 {
     iVec_clear_no_resize(out);
-    int *cursor = (int *) SP_CALLOC(n_arrs, sizeof(int));
+    int *cursor = (int *) sp_calloc(n_arrs, sizeof(int));
     while (1)
     {
         int min_val = 0;
diff --git a/tests/all_tests.c b/tests/all_tests.c
index 9e7ad45..77a9d12 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -110,6 +110,7 @@
 #include "profiling/profile_hessian_exp_AX.h"
 #include "profiling/profile_left_matmul.h"
 #include "profiling/profile_log_reg.h"
+#include "profiling/profile_memory.h"
 #include "profiling/profile_trimmed_log_reg.h"
 #endif /* PROFILE_ONLY */
 
@@ -536,6 +537,7 @@ int main(void)
     mu_run_test(profile_trimmed_log_reg, tests_run);
     mu_run_test(profile_BTA_pd_csr_vs_csc, tests_run);
     mu_run_test(profile_hessian_exp_AX, tests_run);
+    mu_run_test(profile_memory, tests_run);
 #endif /* PROFILE_ONLY */
 
     printf("\n=== All %d tests passed ===\n", tests_run);
diff --git a/tests/problem/test_param_broadcast.h b/tests/problem/test_param_broadcast.h
index 377121a..c64d64b 100644
--- a/tests/problem/test_param_broadcast.h
+++ b/tests/problem/test_param_broadcast.h
@@ -192,8 +192,7 @@ const char *test_const_sum_scalar_mult(void)
 
     problem_init_derivatives(prob);
 
-    /* point for evaluating */
-    double x_vals[1] = {4.0};
+    double x_vals[6] = {4.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 
     problem_constraint_forward(prob, x_vals);
     double constrs[1] = {6.0 * 4.0};
@@ -226,8 +225,7 @@ const char *test_param_sum_scalar_mult(void)
     problem_register_params(prob, param_nodes, 1);
     problem_init_derivatives(prob);
 
-    /* point for evaluating */
-    double x_vals[1] = {4.0};
+    double x_vals[6] = {4.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 
     problem_constraint_forward(prob, x_vals);
     double constrs[1] = {6.0 * 4.0};
diff --git a/tests/profiling/profile_hessian_exp_AX.h b/tests/profiling/profile_hessian_exp_AX.h
index eb75020..a5cdafc 100644
--- a/tests/profiling/profile_hessian_exp_AX.h
+++ b/tests/profiling/profile_hessian_exp_AX.h
@@ -42,9 +42,9 @@ const char *profile_hessian_exp_AX(void)
 
     /* Random A (row-major), X (col-major vec), w. Values in [-0.5, 0.5]
        to keep exp() bounded and finite. */
-    double *A_data = (double *) SP_MALLOC(n_vars * sizeof(double));
-    double *X_vals = (double *) SP_MALLOC(n_vars * sizeof(double));
-    double *w = (double *) SP_MALLOC(n_vars * sizeof(double));
+    double *A_data = (double *) sp_malloc(n_vars * sizeof(double));
+    double *X_vals = (double *) sp_malloc(n_vars * sizeof(double));
+    double *w = (double *) sp_malloc(n_vars * sizeof(double));
     for (int k = 0; k < n_vars; k++)
     {
         A_data[k] = ((double) rand() / (double) RAND_MAX) - 0.5;
@@ -86,7 +86,7 @@ const char *profile_hessian_exp_AX(void)
     /* AX_cm[j*n + i] = sum_k A_data[i*n + k] * X_vals[j*n + k]      */
     /*                = A[i, :] · X[:, j].                           */
     /* ------------------------------------------------------------ */
-    double *AX_cm = (double *) SP_MALLOC(n_vars * sizeof(double));
+    double *AX_cm = (double *) sp_malloc(n_vars * sizeof(double));
     for (int j = 0; j < n; j++)
     {
         for (int i = 0; i < n; i++)
@@ -99,7 +99,7 @@ const char *profile_hessian_exp_AX(void)
             AX_cm[j * n + i] = s;
         }
     }
-    double *d = (double *) SP_MALLOC(n_vars * sizeof(double));
+    double *d = (double *) sp_malloc(n_vars * sizeof(double));
     for (int k = 0; k < n_vars; k++)
     {
         d[k] = w[k] * exp(AX_cm[k]);
@@ -114,8 +114,8 @@ const char *profile_hessian_exp_AX(void)
     Timer t2a;
     clock_gettime(CLOCK_MONOTONIC, &t2a.start);
     permuted_dense **j_blocks =
-        (permuted_dense **) SP_MALLOC(n * sizeof(permuted_dense *));
-    int *row_perm = (int *) SP_MALLOC(n * sizeof(int));
+        (permuted_dense **) sp_malloc(n * sizeof(permuted_dense *));
+    int *row_perm = (int *) sp_malloc(n * sizeof(int));
     for (int k = 0; k < n; k++)
     {
         for (int i = 0; i < n; i++)
@@ -140,8 +140,8 @@ const char *profile_hessian_exp_AX(void)
     /* Densify both Hessians to a 2500 x 2500 row-major buffer and   */
     /* compute max-abs diff.                                         */
     /* ------------------------------------------------------------ */
-    double *D1 = (double *) SP_CALLOC(N, sizeof(double));
-    double *D2 = (double *) SP_CALLOC(N, sizeof(double));
+    double *D1 = (double *) sp_calloc(N, sizeof(double));
+    double *D2 = (double *) sp_calloc(N, sizeof(double));
 
     CSR_matrix *csr = H1->to_csr(H1);
     for (int i = 0; i < csr->m; i++)
diff --git a/tests/profiling/profile_memory.h b/tests/profiling/profile_memory.h
new file mode 100644
index 0000000..fd9946b
--- /dev/null
+++ b/tests/profiling/profile_memory.h
@@ -0,0 +1,59 @@
+#ifndef PROFILE_MEMORY_H
+#define PROFILE_MEMORY_H
+
+#include <stdlib.h>
+
+#include "atoms/affine.h"
+#include "atoms/bivariate_full_dom.h"
+#include "atoms/elementwise_full_dom.h"
+#include "expr.h"
+#include "problem.h"
+
+/* Reproduces the Python test
+       n = 100, rank = 10
+       A in R^{n x n}, B, C in R^{n x rank}
+       cost = sum(exp(A @ (B @ C.T)))
+   and lets free_problem print the SparseDiff banner so we can compare
+   the C "Peak memory" number against the Python report. Bounds on B and C
+   from the Python snippet are solver-level, so they are omitted here
+   (matching the "Number of constraints: 0" line in the Python output). */
+const char *profile_memory(void)
+{
+    const int n = 100;
+    const int rank = 10;
+    const int n_vars = 2 * n * rank;
+    srand(0);
+
+    double *A_data = (double *) malloc((size_t) n * n * sizeof(double));
+    double *u = (double *) malloc((size_t) n_vars * sizeof(double));
+    for (int k = 0; k < n * n; k++)
+    {
+        A_data[k] = ((double) rand() / (double) RAND_MAX) - 0.5;
+    }
+    for (int k = 0; k < n_vars; k++)
+    {
+        u[k] = ((double) rand() / (double) RAND_MAX) - 0.5;
+    }
+
+    expr *B = new_variable(n, rank, 0, n_vars);
+    expr *C = new_variable(n, rank, n * rank, n_vars);
+    expr *CT = new_transpose(C);
+    expr *BCT = new_matmul(B, CT);
+    expr *AX = new_left_matmul_dense(NULL, BCT, n, n, A_data);
+    expr *e = new_exp(AX);
+    expr *cost = new_sum(e, -1);
+
+    problem *prob = new_problem(cost, NULL, 0, /*verbose=*/true);
+
+    problem_init_derivatives(prob);
+    problem_objective_forward(prob, u);
+    problem_gradient(prob);
+    problem_hessian(prob, 1.0, NULL);
+
+    free_problem(prob);
+    free(A_data);
+    free(u);
+    return 0;
+}
+
+#endif /* PROFILE_MEMORY_H */
diff --git a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h
index d8b943b..296dd3c 100644
--- a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h
+++ b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h
@@ -300,13 +300,13 @@ const char *test_wsum_hess_AX_BX_multiply(void)
 
 const char *test_wsum_hess_multiply_deep_composite(void)
 {
-    double u_vals[4] = {1.0, 2.0, 3.0, 4.0};
+    double u_vals[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
     double w[4] = {1.1, 2.2, 3.3, 4.4};
 
     CSR_matrix *A = new_csr_random(2, 2, 1.0);
     CSR_matrix *B = new_csr_random(2, 2, 1.0);
     expr *X = new_variable(2, 2, 0, 8);
-    expr *Y = new_variable(2, 2, 0, 8);
+    expr *Y = new_variable(2, 2, 4, 8);
     expr *AX = new_left_matmul(NULL, X, A);
     expr *BY = new_left_matmul(NULL, Y, B);
     expr *sin_AX = new_sin(AX);