diff --git a/include/utils/tracked_alloc.h b/include/utils/tracked_alloc.h index adc982d..33b7077 100644 --- a/include/utils/tracked_alloc.h +++ b/include/utils/tracked_alloc.h @@ -21,20 +21,60 @@ #include #include -extern size_t g_allocated_bytes; +/* Platform shim for "how many usable bytes are at this malloc'd pointer". + Used to track total live bytes */ +#if defined(__APPLE__) +#include +#define TRACKED_BLOCK_SIZE(p) malloc_size(p) +#elif defined(_WIN32) || defined(_WIN64) +#include +#define TRACKED_BLOCK_SIZE(p) _msize(p) +#else +#include +#define TRACKED_BLOCK_SIZE(p) malloc_usable_size(p) +#endif -static inline void *SP_MALLOC(size_t size) +extern size_t g_allocated_bytes; /* current live bytes */ +extern size_t g_peak_bytes; /* high-water mark since last reset */ + +/* All allocations in src/ must go through these wrappers (and pair sp_free + with sp_malloc / sp_calloc). Tests may use plain malloc/free โ€” those + bytes are simply not tracked. */ +static inline void *sp_malloc(size_t size) { void *ptr = malloc(size); - if (ptr) g_allocated_bytes += size; + if (ptr) + { + g_allocated_bytes += TRACKED_BLOCK_SIZE(ptr); + if (g_allocated_bytes > g_peak_bytes) + { + g_peak_bytes = g_allocated_bytes; + } + } return ptr; } -static inline void *SP_CALLOC(size_t count, size_t size) +static inline void *sp_calloc(size_t count, size_t size) { void *ptr = calloc(count, size); - if (ptr) g_allocated_bytes += count * size; + if (ptr) + { + g_allocated_bytes += TRACKED_BLOCK_SIZE(ptr); + if (g_allocated_bytes > g_peak_bytes) + { + g_peak_bytes = g_allocated_bytes; + } + } return ptr; } +static inline void sp_free(void *ptr) +{ + if (ptr) + { + g_allocated_bytes -= TRACKED_BLOCK_SIZE(ptr); + free(ptr); + } +} + #endif /* TRACKED_ALLOC_H */ diff --git a/src/atoms/affine/add.c b/src/atoms/affine/add.c index 288bfde..7f434e0 100644 --- a/src/atoms/affine/add.c +++ b/src/atoms/affine/add.c @@ -95,7 +95,7 @@ static bool is_affine(const expr *node) expr *new_add(expr *left, expr *right) { assert(left->d1 == right->d1 && left->d2 == right->d2); - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, left->d1, left->d2, left->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = left; diff --git a/src/atoms/affine/broadcast.c b/src/atoms/affine/broadcast.c index 213461b..f5792f8 100644 --- a/src/atoms/affine/broadcast.c +++ b/src/atoms/affine/broadcast.c @@ -96,7 +96,7 @@ static void wsum_hess_init_impl(expr *node) node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* allocate space for weight vector */ - node->work->dwork = SP_MALLOC(node->size * sizeof(double)); + node->work->dwork = sp_malloc(node->size * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) @@ -176,7 +176,7 @@ expr *new_broadcast(expr *child, int d1, int d2) exit(1); } - broadcast_expr *bcast = (broadcast_expr *) SP_CALLOC(1, sizeof(broadcast_expr)); + broadcast_expr *bcast = (broadcast_expr *) sp_calloc(1, sizeof(broadcast_expr)); expr *node = (expr *) bcast; // -------------------------------------------------------------------------- diff --git a/src/atoms/affine/convolve.c b/src/atoms/affine/convolve.c index 9cdfeb7..1698057 100644 --- a/src/atoms/affine/convolve.c +++ b/src/atoms/affine/convolve.c @@ -82,7 +82,8 @@ static void jacobian_init_impl(expr *node) conv_matrix_fill_values(cnode->T, a); /* J = T @ J_child */ - cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian->to_csr(child->jacobian), node->work->iwork); + cnode->Jchild_CSC = csr_to_csc_alloc(child->jacobian->to_csr(child->jacobian), + node->work->iwork); node->jacobian = new_sparse_matrix(csr_csc_matmul_alloc(cnode->T, cnode->Jchild_CSC)); } @@ -95,8 +96,8 @@ static void eval_jacobian(expr *node) child->eval_jacobian(child); /* J = T @ J_child */ - csr_to_csc_fill_values(child->jacobian->to_csr(child->jacobian), cnode->Jchild_CSC, - node->work->iwork); + csr_to_csc_fill_values(child->jacobian->to_csr(child->jacobian), + cnode->Jchild_CSC, node->work->iwork); csr_csc_matmul_fill_values(cnode->T, cnode->Jchild_CSC, node->jacobian->to_csr(node->jacobian)); } @@ -108,7 +109,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(child); node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess); - node->work->dwork = (double *) SP_MALLOC(cnode->n * sizeof(double)); + node->work->dwork = (double *) sp_malloc(cnode->n * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) @@ -174,7 +175,7 @@ expr *new_convolve(expr *param_node, expr *child) exit(1); } - convolve_expr *cnode = (convolve_expr *) SP_CALLOC(1, sizeof(convolve_expr)); + convolve_expr *cnode = (convolve_expr *) sp_calloc(1, sizeof(convolve_expr)); expr *node = &cnode->base; init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, @@ -188,7 +189,7 @@ expr *new_convolve(expr *param_node, expr *child) cnode->n = n; /* iwork is used for csr_to_csc of child's Jacobian (size n_vars). */ - node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int)); + node->work->iwork = (int *) sp_malloc(node->n_vars * sizeof(int)); /* Ensure first forward() pulls current param values through any broadcast/promote wrappers and reflects them in T (once T is built). */ diff --git a/src/atoms/affine/diag_vec.c b/src/atoms/affine/diag_vec.c index 03fbd45..d5d74a0 100644 --- a/src/atoms/affine/diag_vec.c +++ b/src/atoms/affine/diag_vec.c @@ -70,7 +70,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* workspace for extracting diagonal weights */ - node->work->dwork = (double *) SP_CALLOC(x->size, sizeof(double)); + node->work->dwork = (double *) sp_calloc(x->size, sizeof(double)); /* Copy child's Hessian structure (diag_vec is linear, so its own Hessian is * zero) */ @@ -106,7 +106,7 @@ expr *new_diag_vec(expr *child) /* n is the number of elements (works for both row and column vectors) */ int n = child->size; - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, n, n, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = child; diff --git a/src/atoms/affine/hstack.c b/src/atoms/affine/hstack.c index a34f8a4..5bbe8fe 100644 --- a/src/atoms/affine/hstack.c +++ b/src/atoms/affine/hstack.c @@ -187,14 +187,14 @@ expr *new_hstack(expr **args, int n_args, int n_vars) } /* Allocate the type-specific struct */ - hstack_expr *hnode = (hstack_expr *) SP_CALLOC(1, sizeof(hstack_expr)); + hstack_expr *hnode = (hstack_expr *) sp_calloc(1, sizeof(hstack_expr)); expr *node = &hnode->base; init_expr(node, args[0]->d1, d2, n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, wsum_hess_eval, free_type_data); /* Set type-specific fields (deep copy args array) */ - hnode->args = (expr **) SP_CALLOC(n_args, sizeof(expr *)); + hnode->args = (expr **) sp_calloc(n_args, sizeof(expr *)); hnode->n_args = n_args; for (int i = 0; i < n_args; i++) { diff --git a/src/atoms/affine/index.c b/src/atoms/affine/index.c index 3fd070e..c41f79c 100644 --- a/src/atoms/affine/index.c +++ b/src/atoms/affine/index.c @@ -29,7 +29,7 @@ * Returns true if duplicates exist, false otherwise. */ static bool check_for_duplicates(const int *indices, int n_idxs, int max_idx) { - bool *seen = (bool *) SP_CALLOC(max_idx, sizeof(bool)); + bool *seen = (bool *) sp_calloc(max_idx, sizeof(bool)); bool has_dup = false; for (int i = 0; i < n_idxs && !has_dup; i++) { @@ -89,7 +89,7 @@ static void wsum_hess_init_impl(expr *node) wsum_hess_init(x); /* for setting weight vector to evaluate hessian of child */ - node->work->dwork = (double *) SP_CALLOC(x->size, sizeof(double)); + node->work->dwork = (double *) sp_calloc(x->size, sizeof(double)); /* in the implementation of eval_wsum_hess we evaluate the child's hessian with a weight vector that has w[i] = 0 @@ -148,7 +148,7 @@ expr *new_index(expr *child, int d1, int d2, const int *indices, int n_idxs) { assert(d1 * d2 == n_idxs); /* allocate type-specific struct */ - index_expr *idx = (index_expr *) SP_CALLOC(1, sizeof(index_expr)); + index_expr *idx = (index_expr *) sp_calloc(1, sizeof(index_expr)); expr *node = &idx->base; init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl, @@ -159,7 +159,7 @@ expr *new_index(expr *child, int d1, int d2, const int *indices, int n_idxs) expr_retain(child); /* copy indices */ - idx->indices = (int *) SP_MALLOC(n_idxs * sizeof(int)); + idx->indices = (int *) sp_malloc(n_idxs * sizeof(int)); memcpy(idx->indices, indices, n_idxs * sizeof(int)); idx->n_idxs = n_idxs; diff --git a/src/atoms/affine/left_matmul.c b/src/atoms/affine/left_matmul.c index e3e0550..1336ddc 100644 --- a/src/atoms/affine/left_matmul.c +++ b/src/atoms/affine/left_matmul.c @@ -201,7 +201,7 @@ static void wsum_hess_init_impl(expr *node) /* work for computing A^T w*/ int n_blocks = ((left_matmul_expr *) node)->n_blocks; int dim = ((left_matmul_expr *) node)->AT->m * n_blocks; - node->work->dwork = (double *) SP_MALLOC(dim * sizeof(double)); + node->work->dwork = (double *) sp_malloc(dim * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) @@ -263,7 +263,7 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A) /* Allocate the type-specific struct */ left_matmul_expr *lnode = - (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr)); + (left_matmul_expr *) sp_calloc(1, sizeof(left_matmul_expr)); expr *node = &lnode->base; /* Sparse A โ€” always the general CSC-mirror path. */ init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_sparse, @@ -276,8 +276,8 @@ expr *new_left_matmul(expr *param_node, expr *u, const CSR_matrix *A) (requiring size node->n_vars). csc_to_csr_work is used for converting J_CSC to CSR_matrix (requiring node->size) */ - node->work->iwork = (int *) SP_MALLOC(node->n_vars * sizeof(int)); - lnode->csc_to_csr_work = (int *) SP_MALLOC(node->size * sizeof(int)); + node->work->iwork = (int *) sp_malloc(node->n_vars * sizeof(int)); + lnode->csc_to_csr_work = (int *) sp_malloc(node->size * sizeof(int)); lnode->n_blocks = n_blocks; /* store A and AT. new_sparse_matrix takes ownership, so clone first. */ @@ -304,7 +304,7 @@ expr *new_left_matmul_dense(expr *param_node, expr *u, int m, int n, compute_dims(u, m, n, &d1, &d2, &n_blocks); left_matmul_expr *lnode = - (left_matmul_expr *) SP_CALLOC(1, sizeof(left_matmul_expr)); + (left_matmul_expr *) sp_calloc(1, sizeof(left_matmul_expr)); expr *node = &lnode->base; init_expr(node, d1, d2, u->n_vars, forward, jacobian_init_dense, eval_jacobian_dense, is_affine, wsum_hess_init_impl, eval_wsum_hess, diff --git a/src/atoms/affine/neg.c b/src/atoms/affine/neg.c index 01bbf5e..a42c305 100644 --- a/src/atoms/affine/neg.c +++ b/src/atoms/affine/neg.c @@ -85,7 +85,7 @@ static bool is_affine(const expr *node) expr *new_neg(expr *child) { - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, child->d1, child->d2, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = child; diff --git a/src/atoms/affine/parameter.c b/src/atoms/affine/parameter.c index 0b9ffbe..a757edc 100644 --- a/src/atoms/affine/parameter.c +++ b/src/atoms/affine/parameter.c @@ -61,7 +61,7 @@ static bool is_affine(const expr *node) expr *new_parameter(int d1, int d2, int param_id, int n_vars, const double *values) { - parameter_expr *pnode = (parameter_expr *) SP_CALLOC(1, sizeof(parameter_expr)); + parameter_expr *pnode = (parameter_expr *) sp_calloc(1, sizeof(parameter_expr)); expr *node = &pnode->base; init_expr(node, d1, d2, n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); diff --git a/src/atoms/affine/promote.c b/src/atoms/affine/promote.c index 1a031d8..529f846 100644 --- a/src/atoms/affine/promote.c +++ b/src/atoms/affine/promote.c @@ -84,7 +84,7 @@ static bool is_affine(const expr *node) expr *new_promote(expr *child, int d1, int d2) { assert(child->size == 1); - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = child; diff --git a/src/atoms/affine/reshape.c b/src/atoms/affine/reshape.c index e59a7ec..5517112 100644 --- a/src/atoms/affine/reshape.c +++ b/src/atoms/affine/reshape.c @@ -69,7 +69,7 @@ static bool is_affine(const expr *node) expr *new_reshape(expr *child, int d1, int d2) { assert(d1 * d2 == child->size); - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, d1, d2, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = child; diff --git a/src/atoms/affine/right_matmul.c b/src/atoms/affine/right_matmul.c index 5e3b39f..f76db91 100644 --- a/src/atoms/affine/right_matmul.c +++ b/src/atoms/affine/right_matmul.c @@ -58,7 +58,7 @@ expr *new_right_matmul(expr *param_node, expr *u, const CSR_matrix *A) { /* We can express right matmul using left matmul and transpose: u @ A = (A^T @ u^T)^T. */ - int *work_transpose = (int *) SP_MALLOC(A->n * sizeof(int)); + int *work_transpose = (int *) sp_malloc(A->n * sizeof(int)); CSR_matrix *AT = transpose(A, work_transpose); expr *u_transpose = new_transpose(u); @@ -104,7 +104,7 @@ expr *new_right_matmul_dense(expr *param_node, expr *u, int m, int n, } else { - double *AT = (double *) SP_MALLOC(n * m * sizeof(double)); + double *AT = (double *) sp_malloc(n * m * sizeof(double)); A_transpose(AT, data, m, n); left_matmul_node = new_left_matmul_dense(NULL, u_transpose, n, m, AT); free(AT); diff --git a/src/atoms/affine/scalar_mult.c b/src/atoms/affine/scalar_mult.c index 433f3e1..dbf8455 100644 --- a/src/atoms/affine/scalar_mult.c +++ b/src/atoms/affine/scalar_mult.c @@ -118,7 +118,7 @@ static void free_type_data(expr *node) expr *new_scalar_mult(expr *param_node, expr *child) { scalar_mult_expr *mult_node = - (scalar_mult_expr *) SP_CALLOC(1, sizeof(scalar_mult_expr)); + (scalar_mult_expr *) sp_calloc(1, sizeof(scalar_mult_expr)); expr *node = &mult_node->base; init_expr(node, child->d1, child->d2, child->n_vars, forward, jacobian_init_impl, diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 5de6b80..f042d11 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -93,8 +93,8 @@ static void jacobian_init_impl(expr *node) /* we never have to store more than the child's nnz */ CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, Jx->nnz); - node->work->iwork = SP_MALLOC(MAX(jac->n, Jx->nnz) * sizeof(int)); - snode->idx_map = SP_MALLOC(Jx->nnz * sizeof(int)); + node->work->iwork = sp_malloc(MAX(jac->n, Jx->nnz) * sizeof(int)); + snode->idx_map = sp_malloc(Jx->nnz * sizeof(int)); /* the idx_map array maps each nonzero entry j in x->jacobian to the corresponding index in the output row matrix C. Specifically, for @@ -150,7 +150,7 @@ static void wsum_hess_init_impl(expr *node) /* we never have to store more than the child's nnz */ node->wsum_hess = child->wsum_hess->copy_sparsity(child->wsum_hess); - node->work->dwork = SP_MALLOC(child->size * sizeof(double)); + node->work->dwork = sp_malloc(child->size * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) @@ -210,7 +210,7 @@ expr *new_sum(expr *child, int axis) } /* Allocate the type-specific struct */ - sum_expr *snode = (sum_expr *) SP_CALLOC(1, sizeof(sum_expr)); + sum_expr *snode = (sum_expr *) sp_calloc(1, sizeof(sum_expr)); expr *node = &snode->base; /* to be consistent with CVXPY and NumPy we treat the result from diff --git a/src/atoms/affine/trace.c b/src/atoms/affine/trace.c index 846a45b..2dca9a5 100644 --- a/src/atoms/affine/trace.c +++ b/src/atoms/affine/trace.c @@ -72,14 +72,14 @@ static void jacobian_init_impl(expr *node) // fill sparsity pattern and idx_map // --------------------------------------------------------------- trace_expr *tnode = (trace_expr *) node; - node->work->iwork = SP_MALLOC(MAX(jac->n, total_nnz) * sizeof(int)); + node->work->iwork = sp_malloc(MAX(jac->n, total_nnz) * sizeof(int)); /* the idx_map array maps each nonzero entry j in the original matrix A (from the selected, evenly spaced rows) to the corresponding index in the output row matrix C. Specifically, for each nonzero entry j in A (from the selected rows), idx_map[j] gives the position in C->x where the value from A->x[j] should be accumulated. */ - tnode->idx_map = SP_MALLOC(A->nnz * sizeof(int)); + tnode->idx_map = sp_malloc(A->nnz * sizeof(int)); sum_spaced_rows_into_row_csr_alloc(A, jac, row_spacing, node->work->iwork, tnode->idx_map); node->jacobian = new_sparse_matrix(jac); @@ -107,7 +107,7 @@ static void wsum_hess_init_impl(expr *node) /* initialize child's hessian */ wsum_hess_init(x); - node->work->dwork = (double *) SP_CALLOC(x->size, sizeof(double)); + node->work->dwork = (double *) sp_calloc(x->size, sizeof(double)); /* We copy over the sparsity pattern from the child. This also includes the contribution to wsum_hess of entries of the child that will always have @@ -148,7 +148,7 @@ static void free_type_data(expr *node) expr *new_trace(expr *child) { - trace_expr *tnode = (trace_expr *) SP_CALLOC(1, sizeof(trace_expr)); + trace_expr *tnode = (trace_expr *) sp_calloc(1, sizeof(trace_expr)); expr *node = &tnode->base; init_expr(node, 1, 1, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); diff --git a/src/atoms/affine/transpose.c b/src/atoms/affine/transpose.c index 59c961e..4760376 100644 --- a/src/atoms/affine/transpose.c +++ b/src/atoms/affine/transpose.c @@ -44,7 +44,7 @@ static void jacobian_init_impl(expr *node) /* The transpose's Jacobian is a row permutation of the child's: J_node[r, :] = J_child[k(r), :] where k(r) = (r/d1) + (r%d1)*d2. */ - int *indices = (int *) SP_MALLOC(n_out * sizeof(int)); + int *indices = (int *) sp_malloc(n_out * sizeof(int)); for (int r = 0; r < n_out; r++) { indices[r] = (r / d1) + (r % d1) * d2; @@ -74,7 +74,7 @@ static void wsum_hess_init_impl(expr *node) node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* for computing Kw where K is the commutation matrix */ - node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); + node->work->dwork = (double *) sp_malloc(node->size * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) { @@ -104,7 +104,7 @@ static bool is_affine(const expr *node) expr *new_transpose(expr *child) { - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, child->d2, child->d1, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = child; diff --git a/src/atoms/affine/variable.c b/src/atoms/affine/variable.c index fe010e7..3291829 100644 --- a/src/atoms/affine/variable.c +++ b/src/atoms/affine/variable.c @@ -66,7 +66,7 @@ static bool is_affine(const expr *node) expr *new_variable(int d1, int d2, int var_id, int n_vars) { - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, d1, d2, n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, wsum_hess_eval, NULL); node->var_id = var_id; diff --git a/src/atoms/affine/vector_mult.c b/src/atoms/affine/vector_mult.c index ee7992b..6a3b2cf 100644 --- a/src/atoms/affine/vector_mult.c +++ b/src/atoms/affine/vector_mult.c @@ -85,7 +85,7 @@ static void wsum_hess_init_impl(expr *node) node->wsum_hess = x->wsum_hess->copy_sparsity(x->wsum_hess); /* workspace for storing scaled weights */ - node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); + node->work->dwork = (double *) sp_malloc(node->size * sizeof(double)); } static void eval_wsum_hess(expr *node, const double *w) @@ -124,7 +124,7 @@ static bool is_affine(const expr *node) expr *new_vector_mult(expr *param_node, expr *child) { vector_mult_expr *vnode = - (vector_mult_expr *) SP_CALLOC(1, sizeof(vector_mult_expr)); + (vector_mult_expr *) sp_calloc(1, sizeof(vector_mult_expr)); expr *node = &vnode->base; init_expr(node, child->d1, child->d2, child->n_vars, forward, jacobian_init_impl, diff --git a/src/atoms/affine/vstack.c b/src/atoms/affine/vstack.c index 18e1376..87ecb9b 100644 --- a/src/atoms/affine/vstack.c +++ b/src/atoms/affine/vstack.c @@ -35,7 +35,7 @@ expr *new_vstack(expr **args, int n_args, int n_vars) assert(args[i]->d2 == args[0]->d2); } - expr **transposed = (expr **) SP_MALLOC(n_args * sizeof(expr *)); + expr **transposed = (expr **) sp_malloc(n_args * sizeof(expr *)); for (int i = 0; i < n_args; i++) { transposed[i] = new_transpose(args[i]); diff --git a/src/atoms/bivariate_full_dom/matmul.c b/src/atoms/bivariate_full_dom/matmul.c index 4180327..f75d1e1 100644 --- a/src/atoms/bivariate_full_dom/matmul.c +++ b/src/atoms/bivariate_full_dom/matmul.c @@ -433,12 +433,12 @@ static void wsum_hess_init_chain_rule(expr *node) mnode->B = build_cross_hessian_sparsity(m, k, n); mnode->BJg = csr_csc_matmul_alloc(mnode->B, Jg); int max_alloc = MAX(mnode->BJg->m, mnode->BJg->n); - mnode->BJg_csc_work = (int *) SP_MALLOC(max_alloc * sizeof(int)); + mnode->BJg_csc_work = (int *) sp_malloc(max_alloc * sizeof(int)); mnode->BJg_CSC = csr_to_csc_alloc(mnode->BJg, mnode->BJg_csc_work); mnode->C = BTA_alloc(mnode->BJg_CSC, Jf); /* initialize C^T */ - node->work->iwork = (int *) SP_MALLOC(mnode->C->m * sizeof(int)); + node->work->iwork = (int *) sp_malloc(mnode->C->m * sizeof(int)); mnode->CT = AT_alloc(mnode->C, node->work->iwork); /* initialize Hessians of children */ @@ -475,7 +475,7 @@ static void wsum_hess_init_chain_rule(expr *node) if (!f->is_affine(f) || !g->is_affine(g)) { node->work->dwork = - (double *) SP_MALLOC(MAX(f->size, g->size) * sizeof(double)); + (double *) sp_malloc(MAX(f->size, g->size) * sizeof(double)); } } @@ -560,7 +560,7 @@ expr *new_matmul(expr *x, expr *y) } /* Allocate the expression node */ - expr *node = (expr *) SP_CALLOC(1, sizeof(matmul_expr)); + expr *node = (expr *) sp_calloc(1, sizeof(matmul_expr)); /* Choose no-chain-rule or chain-rule function pointers */ bool use_chain_rule = !(x->var_id != NOT_A_VARIABLE && diff --git a/src/atoms/bivariate_full_dom/multiply.c b/src/atoms/bivariate_full_dom/multiply.c index 325f9a4..93c8e2e 100644 --- a/src/atoms/bivariate_full_dom/multiply.c +++ b/src/atoms/bivariate_full_dom/multiply.c @@ -149,7 +149,7 @@ static void wsum_hess_init_impl(expr *node) /* used for computing weights to wsum_hess of children */ if (!x->is_affine(x) || !y->is_affine(y)) { - node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); + node->work->dwork = (double *) sp_malloc(node->size * sizeof(double)); } /* For sparse matrices we need the CSC cache to be valid for the @@ -318,7 +318,7 @@ static bool is_affine(const expr *node) expr *new_elementwise_mult(expr *left, expr *right) { elementwise_mult_expr *mul_node = - (elementwise_mult_expr *) SP_CALLOC(1, sizeof(elementwise_mult_expr)); + (elementwise_mult_expr *) sp_calloc(1, sizeof(elementwise_mult_expr)); expr *node = &mul_node->base; init_expr(node, left->d1, left->d2, left->n_vars, forward, jacobian_init_impl, diff --git a/src/atoms/bivariate_restricted_dom/quad_over_lin.c b/src/atoms/bivariate_restricted_dom/quad_over_lin.c index 12be5a1..4a49843 100644 --- a/src/atoms/bivariate_restricted_dom/quad_over_lin.c +++ b/src/atoms/bivariate_restricted_dom/quad_over_lin.c @@ -84,10 +84,10 @@ static void jacobian_init_impl(expr *node) } else /* left node is not a variable (guaranteed to be a linear operator) */ { - node->work->dwork = (double *) SP_MALLOC(x->size * sizeof(double)); + node->work->dwork = (double *) sp_malloc(x->size * sizeof(double)); /* compute required allocation and allocate jacobian */ - bool *col_nz = (bool *) SP_CALLOC( + bool *col_nz = (bool *) sp_calloc( node->n_vars, sizeof(bool)); /* TODO: could use iwork here instead*/ CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); int nonzero_cols = count_nonzero_cols(Jx, col_nz); @@ -114,7 +114,7 @@ static void jacobian_init_impl(expr *node) jac->p[1] = jac->nnz; /* find position where y should be inserted */ - node->work->iwork = (int *) SP_MALLOC(sizeof(int)); + node->work->iwork = (int *) sp_malloc(sizeof(int)); for (int j = 0; j < jac->nnz; j++) { if (jac->i[j] == y->var_id) @@ -340,7 +340,7 @@ expr *new_quad_over_lin(expr *left, expr *right) exit(EXIT_FAILURE); } - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, 1, 1, left->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, NULL); node->left = left; diff --git a/src/atoms/bivariate_restricted_dom/rel_entr.c b/src/atoms/bivariate_restricted_dom/rel_entr.c index ecd4520..3dca836 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr.c @@ -206,7 +206,7 @@ expr *new_rel_entr_vector_args(expr *left, expr *right) exit(EXIT_FAILURE); } - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, left->d1, left->d2, left->n_vars, forward_vector_args, jacobian_init_vectors_args, eval_jacobian_vector_args, is_affine, wsum_hess_init_vector_args, eval_wsum_hess_vector_args, NULL); diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c index a356289..7216063 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_scalar_vector.c @@ -105,8 +105,7 @@ static void wsum_hess_init_scalar_vector(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - CSR_matrix *H = - new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { @@ -220,7 +219,7 @@ static bool is_affine(const expr *node) expr *new_rel_entr_first_arg_scalar(expr *left, expr *right) { assert(left->d1 == 1 && left->d2 == 1); - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, right->d1, right->d2, left->n_vars, forward_scalar_vector, jacobian_init_scalar_vector, eval_jacobian_scalar_vector, is_affine, wsum_hess_init_scalar_vector, eval_wsum_hess_scalar_vector, NULL); diff --git a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c index f2b7928..12177c6 100644 --- a/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c +++ b/src/atoms/bivariate_restricted_dom/rel_entr_vector_scalar.c @@ -105,8 +105,7 @@ static void wsum_hess_init_vector_scalar(expr *node) int var_id_x = x->var_id; int var_id_y = y->var_id; - CSR_matrix *H = - new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); + CSR_matrix *H = new_CSR_matrix(node->n_vars, node->n_vars, 3 * node->size + 1); if (var_id_x < var_id_y) { @@ -221,7 +220,7 @@ static bool is_affine(const expr *node) expr *new_rel_entr_second_arg_scalar(expr *left, expr *right) { assert(right->d1 == 1 && right->d2 == 1); - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); init_expr(node, left->d1, left->d2, left->n_vars, forward_vector_scalar, jacobian_init_vector_scalar, eval_jacobian_vector_scalar, is_affine, wsum_hess_init_vector_scalar, eval_wsum_hess_vector_scalar, NULL); diff --git a/src/atoms/elementwise_full_dom/common.c b/src/atoms/elementwise_full_dom/common.c index 352f784..c834b73 100644 --- a/src/atoms/elementwise_full_dom/common.c +++ b/src/atoms/elementwise_full_dom/common.c @@ -47,9 +47,9 @@ void jacobian_init_elementwise(expr *node) /* jacobian of h(x) = f(g(x)) is Jf @ Jg, and here Jf is diagonal */ jacobian_init(child); node->jacobian = child->jacobian->copy_sparsity(child->jacobian); - node->work->dwork = (double *) SP_MALLOC(node->size * sizeof(double)); + node->work->dwork = (double *) sp_malloc(node->size * sizeof(double)); node->work->local_jac_diag = - (double *) SP_MALLOC(node->size * sizeof(double)); + (double *) sp_malloc(node->size * sizeof(double)); } } @@ -118,8 +118,7 @@ void wsum_hess_init_elementwise(expr *node) child->wsum_hess->copy_sparsity(child->wsum_hess); /* wsum_hess = term1 + term2 */ - int max_nnz = - node->work->hess_term1->nnz + node->work->hess_term2->nnz; + int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz; node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz); sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2, @@ -175,8 +174,8 @@ void eval_wsum_hess_elementwise(expr *node, const double *w) child->wsum_hess->nnz * sizeof(double)); /* wsum_hess = term1 + term2 */ - sum_matrices_fill_values(node->work->hess_term1, - node->work->hess_term2, node->wsum_hess); + sum_matrices_fill_values(node->work->hess_term1, node->work->hess_term2, + node->wsum_hess); } } } @@ -202,7 +201,7 @@ void init_elementwise(expr *node, expr *child) expr *new_elementwise(expr *child) { - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); if (!node) return NULL; init_elementwise(node, child); diff --git a/src/atoms/elementwise_full_dom/power.c b/src/atoms/elementwise_full_dom/power.c index cde9c34..bdd227c 100644 --- a/src/atoms/elementwise_full_dom/power.c +++ b/src/atoms/elementwise_full_dom/power.c @@ -61,7 +61,7 @@ static void local_wsum_hess(expr *node, double *out, const double *w) expr *new_power(expr *child, double p) { /* Allocate the type-specific struct */ - power_expr *pnode = (power_expr *) SP_CALLOC(1, sizeof(power_expr)); + power_expr *pnode = (power_expr *) sp_calloc(1, sizeof(power_expr)); expr *node = &pnode->base; init_elementwise(node, child); node->forward = forward; diff --git a/src/atoms/elementwise_restricted_dom/common.c b/src/atoms/elementwise_restricted_dom/common.c index 5c88733..bd90c95 100644 --- a/src/atoms/elementwise_restricted_dom/common.c +++ b/src/atoms/elementwise_restricted_dom/common.c @@ -63,7 +63,7 @@ bool is_affine_restricted(const expr *node) expr *new_restricted(expr *child) { - expr *node = (expr *) SP_CALLOC(1, sizeof(expr)); + expr *node = (expr *) sp_calloc(1, sizeof(expr)); if (!node) return NULL; init_expr(node, child->d1, child->d2, child->n_vars, NULL, diff --git a/src/atoms/other/prod.c b/src/atoms/other/prod.c index aeb7226..3feab4a 100644 --- a/src/atoms/other/prod.c +++ b/src/atoms/other/prod.c @@ -217,7 +217,7 @@ I think they return row vectors.*/ expr *new_prod(expr *child) { /* Output is scalar: 1 x 1 */ - prod_expr *pnode = (prod_expr *) SP_CALLOC(1, sizeof(prod_expr)); + prod_expr *pnode = (prod_expr *) sp_calloc(1, sizeof(prod_expr)); expr *node = &pnode->base; init_expr(node, 1, 1, child->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); diff --git a/src/atoms/other/prod_axis_one.c b/src/atoms/other/prod_axis_one.c index 2e0e971..5ef7cc6 100644 --- a/src/atoms/other/prod_axis_one.c +++ b/src/atoms/other/prod_axis_one.c @@ -398,7 +398,7 @@ static void free_type_data(expr *node) expr *new_prod_axis_one(expr *child) { - prod_axis *pnode = (prod_axis *) SP_CALLOC(1, sizeof(prod_axis)); + prod_axis *pnode = (prod_axis *) sp_calloc(1, sizeof(prod_axis)); expr *node = &pnode->base; /* output is always a row vector 1 x d1 (one product per row) */ @@ -407,9 +407,9 @@ expr *new_prod_axis_one(expr *child) free_type_data); /* allocate arrays to store per-row statistics */ - pnode->num_of_zeros = (int *) SP_CALLOC(child->d1, sizeof(int)); - pnode->zero_index = (int *) SP_CALLOC(child->d1, sizeof(int)); - pnode->prod_nonzero = (double *) SP_CALLOC(child->d1, sizeof(double)); + pnode->num_of_zeros = (int *) sp_calloc(child->d1, sizeof(int)); + pnode->zero_index = (int *) sp_calloc(child->d1, sizeof(int)); + pnode->prod_nonzero = (double *) sp_calloc(child->d1, sizeof(double)); node->left = child; expr_retain(child); diff --git a/src/atoms/other/prod_axis_zero.c b/src/atoms/other/prod_axis_zero.c index 55bedd2..b1f51df 100644 --- a/src/atoms/other/prod_axis_zero.c +++ b/src/atoms/other/prod_axis_zero.c @@ -358,7 +358,7 @@ static void free_type_data(expr *node) /* TODO: refactor to remove diagonal entry as nonzero since it's always zero */ expr *new_prod_axis_zero(expr *child) { - prod_axis *pnode = (prod_axis *) SP_CALLOC(1, sizeof(prod_axis)); + prod_axis *pnode = (prod_axis *) sp_calloc(1, sizeof(prod_axis)); expr *node = &pnode->base; /* output is always a row vector 1 x d2 - TODO: is that correct? */ @@ -367,9 +367,9 @@ expr *new_prod_axis_zero(expr *child) free_type_data); /* allocate arrays to store per-column statistics */ - pnode->num_of_zeros = (int *) SP_CALLOC(child->d2, sizeof(int)); - pnode->zero_index = (int *) SP_CALLOC(child->d2, sizeof(int)); - pnode->prod_nonzero = (double *) SP_CALLOC(child->d2, sizeof(double)); + pnode->num_of_zeros = (int *) sp_calloc(child->d2, sizeof(int)); + pnode->zero_index = (int *) sp_calloc(child->d2, sizeof(int)); + pnode->prod_nonzero = (double *) sp_calloc(child->d2, sizeof(double)); node->left = child; expr_retain(child); diff --git a/src/atoms/other/quad_form.c b/src/atoms/other/quad_form.c index d49c16b..d02e52c 100644 --- a/src/atoms/other/quad_form.c +++ b/src/atoms/other/quad_form.c @@ -18,8 +18,8 @@ #include "atoms/non_elementwise_full_dom.h" #include "subexpr.h" #include "utils/CSC_matrix.h" -#include "utils/matrix_sum.h" #include "utils/cblas_wrapper.h" +#include "utils/matrix_sum.h" #include "utils/sparse_matrix.h" #include "utils/tracked_alloc.h" #include @@ -107,8 +107,8 @@ static void eval_jacobian(expr *node) if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), x->work->jacobian_csc, - x->work->csc_work); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), + x->work->jacobian_csc, x->work->csc_work); if (x->is_affine(x)) { @@ -173,8 +173,7 @@ static void wsum_hess_init_impl(expr *node) node->work->hess_term2 = x->wsum_hess->copy_sparsity(x->wsum_hess); /* hess = term1 + term2 */ - int max_nnz = - node->work->hess_term1->nnz + node->work->hess_term2->nnz; + int max_nnz = node->work->hess_term1->nnz + node->work->hess_term2->nnz; node->wsum_hess = new_sparse_matrix_alloc(node->n_vars, node->n_vars, max_nnz); sum_matrices_alloc(node->work->hess_term1, node->work->hess_term2, @@ -201,7 +200,8 @@ static void eval_wsum_hess(expr *node, const double *w) CSC_matrix *Jf = x->work->jacobian_csc; if (!x->work->jacobian_csc_filled) { - csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf, x->work->csc_work); + csr_to_csc_fill_values(x->jacobian->to_csr(x->jacobian), Jf, + x->work->csc_work); if (x->is_affine(x)) { @@ -222,8 +222,10 @@ static void eval_wsum_hess(expr *node, const double *w) x->wsum_hess->nnz * sizeof(double)); /* scale both terms by 2w */ - cblas_dscal(node->work->hess_term1->nnz, two_w, node->work->hess_term1->x, 1); - cblas_dscal(node->work->hess_term2->nnz, two_w, node->work->hess_term2->x, 1); + cblas_dscal(node->work->hess_term1->nnz, two_w, node->work->hess_term1->x, + 1); + cblas_dscal(node->work->hess_term2->nnz, two_w, node->work->hess_term2->x, + 1); /* sum the two terms */ sum_matrices_fill_values(node->work->hess_term1, node->work->hess_term2, @@ -253,7 +255,7 @@ static bool is_affine(const expr *node) expr *new_quad_form(expr *left, CSR_matrix *Q) { assert(left->d1 == 1 || left->d2 == 1); /* left must be a vector */ - quad_form_expr *qnode = (quad_form_expr *) SP_CALLOC(1, sizeof(quad_form_expr)); + quad_form_expr *qnode = (quad_form_expr *) sp_calloc(1, sizeof(quad_form_expr)); expr *node = &qnode->base; init_expr(node, 1, 1, left->n_vars, forward, jacobian_init_impl, eval_jacobian, @@ -266,6 +268,6 @@ expr *new_quad_form(expr *left, CSR_matrix *Q) copy_CSR_matrix(Q, qnode->Q); /* dwork stores the result of Q @ f(x) in the forward pass */ - node->work->dwork = (double *) SP_MALLOC(left->size * sizeof(double)); + node->work->dwork = (double *) sp_malloc(left->size * sizeof(double)); return node; } diff --git a/src/expr.c b/src/expr.c index b99b266..03212c2 100644 --- a/src/expr.c +++ b/src/expr.c @@ -32,7 +32,7 @@ void init_expr(expr *node, int d1, int d2, int n_vars, forward_fn forward, node->size = d1 * d2; node->n_vars = n_vars; node->refcount = 0; - node->value = (double *) SP_CALLOC(d1 * d2, sizeof(double)); + node->value = (double *) sp_calloc(d1 * d2, sizeof(double)); node->var_id = NOT_A_VARIABLE; node->forward = forward; node->jacobian_init_impl = jacobian_init; @@ -41,7 +41,7 @@ void init_expr(expr *node, int d1, int d2, int n_vars, forward_fn forward, node->wsum_hess_init_impl = wsum_hess_init; node->eval_wsum_hess = eval_wsum_hess; node->free_type_data = free_type_data; - node->work = (Expr_Work *) SP_CALLOC(1, sizeof(Expr_Work)); + node->work = (Expr_Work *) sp_calloc(1, sizeof(Expr_Work)); } void jacobian_csc_init(expr *node) @@ -50,9 +50,9 @@ void jacobian_csc_init(expr *node) { return; } - node->work->csc_work = (int *) SP_MALLOC(node->n_vars * sizeof(int)); - node->work->jacobian_csc = - csr_to_csc_alloc(node->jacobian->to_csr(node->jacobian), node->work->csc_work); + node->work->csc_work = (int *) sp_malloc(node->n_vars * sizeof(int)); + node->work->jacobian_csc = csr_to_csc_alloc( + node->jacobian->to_csr(node->jacobian), node->work->csc_work); } void free_expr(expr *node) diff --git a/src/old-code/linear_op.c b/src/old-code/linear_op.c index 8637ee6..b66a8d9 100644 --- a/src/old-code/linear_op.c +++ b/src/old-code/linear_op.c @@ -91,7 +91,7 @@ expr *new_linear(expr *u, const CSR_matrix *A, const double *b) assert(u->d2 == 1); /* Allocate the type-specific struct */ linear_op_expr *lin_node = - (linear_op_expr *) SP_CALLOC(1, sizeof(linear_op_expr)); + (linear_op_expr *) sp_calloc(1, sizeof(linear_op_expr)); expr *node = &lin_node->base; init_expr(node, A->m, 1, u->n_vars, forward, jacobian_init_impl, eval_jacobian, is_affine, wsum_hess_init_impl, eval_wsum_hess, free_type_data); @@ -106,7 +106,7 @@ expr *new_linear(expr *u, const CSR_matrix *A, const double *b) /* Initialize offset (copy b if provided, otherwise NULL) */ if (b != NULL) { - lin_node->b = (double *) SP_MALLOC(A->m * sizeof(double)); + lin_node->b = (double *) sp_malloc(A->m * sizeof(double)); memcpy(lin_node->b, b, A->m * sizeof(double)); } else diff --git a/src/old-code/old_permuted_dense.c b/src/old-code/old_permuted_dense.c index d32dc07..2f3deb1 100644 --- a/src/old-code/old_permuted_dense.c +++ b/src/old-code/old_permuted_dense.c @@ -30,7 +30,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A) /* Gather the union of columns appearing in A's rows at positions row_perm_B. Use a bitmap of size A->n for O(nnz) collection. */ int p = A->n; - char *seen = (char *) SP_CALLOC(p, sizeof(char)); + char *seen = (char *) sp_calloc(p, sizeof(char)); int s_A = 0; for (int kk = 0; kk < B->m0; kk++) { @@ -46,7 +46,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A) } } - int *col_active = (int *) SP_MALLOC((s_A > 0 ? s_A : 1) * sizeof(int)); + int *col_active = (int *) sp_malloc((s_A > 0 ? s_A : 1) * sizeof(int)); int idx = 0; for (int j = 0; j < p; j++) { @@ -71,7 +71,7 @@ matrix *BTA_pd_csr_alloc(const permuted_dense *B, const CSR_matrix *A) { free(C_pd->kernel_dwork); C_pd->kernel_dwork_size = gather_size; - C_pd->kernel_dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); + C_pd->kernel_dwork = (double *) sp_calloc(gather_size, sizeof(double)); } return C; } @@ -163,7 +163,7 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) /* Gather the union of columns appearing in B's rows at positions row_perm_A. Bitmap of size B_csr->n for O(nnz) collection. */ int q = B_csr->n; - char *seen = (char *) SP_CALLOC(q, sizeof(char)); + char *seen = (char *) sp_calloc(q, sizeof(char)); int r_B = 0; for (int kk = 0; kk < A->m0; kk++) { @@ -179,7 +179,7 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) } } - int *row_active = (int *) SP_MALLOC((r_B > 0 ? r_B : 1) * sizeof(int)); + int *row_active = (int *) sp_malloc((r_B > 0 ? r_B : 1) * sizeof(int)); int idx = 0; for (int i = 0; i < q; i++) { @@ -203,7 +203,7 @@ matrix *BTA_csr_pd_alloc(const CSR_matrix *B_csr, const permuted_dense *A) { free(C_pd->kernel_dwork); C_pd->kernel_dwork_size = gather_size; - C_pd->kernel_dwork = (double *) SP_CALLOC(gather_size, sizeof(double)); + C_pd->kernel_dwork = (double *) sp_calloc(gather_size, sizeof(double)); } return C; } diff --git a/src/problem.c b/src/problem.c index 4a6bab9..c2ec84a 100644 --- a/src/problem.c +++ b/src/problem.c @@ -32,8 +32,11 @@ static void problem_lagrange_hess_fill_sparsity(problem *prob, int *iwork); problem *new_problem(expr *objective, expr **constraints, int n_constraints, bool verbose) { - g_allocated_bytes = 0; - problem *prob = (problem *) SP_CALLOC(1, sizeof(problem)); + /* we don't reset g_peak_bytes or g_allocated_bytes since allocations + using sp_malloc/sp_calloc might have happened before new_problem in eg., + left_matmul, and their frees will subtract from this counter. */ + g_peak_bytes = g_allocated_bytes; + problem *prob = (problem *) sp_calloc(1, sizeof(problem)); if (!prob) return NULL; /* objective */ @@ -47,7 +50,7 @@ problem *new_problem(expr *objective, expr **constraints, int n_constraints, prob->n_constraints = n_constraints; if (n_constraints > 0) { - prob->constraints = (expr **) SP_MALLOC(n_constraints * sizeof(expr *)); + prob->constraints = (expr **) sp_malloc(n_constraints * sizeof(expr *)); for (int i = 0; i < n_constraints; i++) { prob->constraints[i] = constraints[i]; @@ -58,8 +61,8 @@ problem *new_problem(expr *objective, expr **constraints, int n_constraints, /* allocation */ prob->constraint_values = - (double *) SP_CALLOC(prob->total_constraint_size, sizeof(double)); - prob->gradient_values = (double *) SP_CALLOC(prob->n_vars, sizeof(double)); + (double *) sp_calloc(prob->total_constraint_size, sizeof(double)); + prob->gradient_values = (double *) sp_calloc(prob->n_vars, sizeof(double)); /* Initialize statistics */ prob->stats.time_init_derivatives = 0.0; @@ -251,8 +254,8 @@ void problem_init_hessian(problem *prob) prob->lagrange_hessian = new_CSR_matrix(prob->n_vars, prob->n_vars, nnz); memset(prob->lagrange_hessian->x, 0, nnz * sizeof(double)); /* affine shortcut */ prob->stats.nnz_hessian = nnz; - prob->hess_idx_map = (int *) SP_MALLOC(nnz * sizeof(int)); - int *iwork = (int *) SP_MALLOC(MAX(nnz, prob->n_vars) * sizeof(int)); + prob->hess_idx_map = (int *) sp_malloc(nnz * sizeof(int)); + int *iwork = (int *) sp_malloc(MAX(nnz, prob->n_vars) * sizeof(int)); problem_lagrange_hess_fill_sparsity(prob, iwork); free(iwork); @@ -321,7 +324,7 @@ static inline void print_end_message(const Diff_engine_stats *stats) printf(" Lagrange Hessian (nnz): %d\n", stats->nnz_hessian); char mem_buf[64]; format_memory(stats->memory_bytes, mem_buf, sizeof(mem_buf)); - printf(" Allocated memory: %s\n", mem_buf); + printf(" Peak memory: %s\n", mem_buf); printf("\nTiming (seconds):\n"); printf(" Derivative structure (sparsity): %8.3f\n", @@ -349,7 +352,7 @@ void free_problem(problem *prob) if (prob->verbose) { - prob->stats.memory_bytes = g_allocated_bytes; + prob->stats.memory_bytes = g_peak_bytes; print_end_message(&prob->stats); } @@ -380,7 +383,7 @@ void free_problem(problem *prob) void problem_register_params(problem *prob, expr **param_nodes, int n_param_nodes) { prob->n_param_nodes = n_param_nodes; - prob->param_nodes = (expr **) SP_MALLOC(n_param_nodes * sizeof(expr *)); + prob->param_nodes = (expr **) sp_malloc(n_param_nodes * sizeof(expr *)); memcpy(prob->param_nodes, param_nodes, n_param_nodes * sizeof(expr *)); prob->total_parameter_size = 0; diff --git a/src/utils/COO_matrix.c b/src/utils/COO_matrix.c index 8602c09..0cfe2c4 100644 --- a/src/utils/COO_matrix.c +++ b/src/utils/COO_matrix.c @@ -22,13 +22,13 @@ COO_matrix *new_COO_matrix(const CSR_matrix *A) { - COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix)); + COO_matrix *coo = (COO_matrix *) sp_malloc(sizeof(COO_matrix)); coo->m = A->m; coo->n = A->n; coo->nnz = A->nnz; - coo->rows = (int *) SP_MALLOC(A->nnz * sizeof(int)); - coo->cols = (int *) SP_MALLOC(A->nnz * sizeof(int)); - coo->x = (double *) SP_MALLOC(A->nnz * sizeof(double)); + coo->rows = (int *) sp_malloc(A->nnz * sizeof(int)); + coo->cols = (int *) sp_malloc(A->nnz * sizeof(int)); + coo->x = (double *) sp_malloc(A->nnz * sizeof(double)); coo->value_map = NULL; for (int r = 0; r < A->m; r++) @@ -60,14 +60,14 @@ COO_matrix *new_COO_matrix_lower_triangular(const CSR_matrix *A) } } - COO_matrix *coo = (COO_matrix *) SP_MALLOC(sizeof(COO_matrix)); + COO_matrix *coo = (COO_matrix *) sp_malloc(sizeof(COO_matrix)); coo->m = A->m; coo->n = A->n; coo->nnz = count; - coo->rows = (int *) SP_MALLOC(count * sizeof(int)); - coo->cols = (int *) SP_MALLOC(count * sizeof(int)); - coo->x = (double *) SP_MALLOC(count * sizeof(double)); - coo->value_map = (int *) SP_MALLOC(count * sizeof(int)); + coo->rows = (int *) sp_malloc(count * sizeof(int)); + coo->cols = (int *) sp_malloc(count * sizeof(int)); + coo->x = (double *) sp_malloc(count * sizeof(double)); + coo->value_map = (int *) sp_malloc(count * sizeof(int)); /* Pass 2: fill arrays */ int idx = 0; diff --git a/src/utils/CSC_matrix.c b/src/utils/CSC_matrix.c index 08eec98..5fc5aa1 100644 --- a/src/utils/CSC_matrix.c +++ b/src/utils/CSC_matrix.c @@ -24,12 +24,12 @@ CSC_matrix *new_CSC_matrix(int m, int n, int nnz) { - CSC_matrix *matrix = (CSC_matrix *) SP_MALLOC(sizeof(CSC_matrix)); + CSC_matrix *matrix = (CSC_matrix *) sp_malloc(sizeof(CSC_matrix)); if (!matrix) return NULL; - matrix->p = (int *) SP_MALLOC((n + 1) * sizeof(int)); - matrix->i = (int *) SP_MALLOC(nnz * sizeof(int)); - matrix->x = (double *) SP_MALLOC(nnz * sizeof(double)); + matrix->p = (int *) sp_malloc((n + 1) * sizeof(int)); + matrix->i = (int *) sp_malloc(nnz * sizeof(int)); + matrix->x = (double *) sp_malloc(nnz * sizeof(double)); if (!matrix->p || !matrix->i || !matrix->x) { @@ -67,7 +67,7 @@ CSR_matrix *ATA_alloc(const CSC_matrix *A) int i, j, ii, jj; /* row ptr and column idxs for upper triangular part of C = A^T A */ - int *Cp = (int *) SP_MALLOC((n + 1) * sizeof(int)); + int *Cp = (int *) sp_malloc((n + 1) * sizeof(int)); iVec *Ci = iVec_new(m); Cp[0] = 0; @@ -341,7 +341,7 @@ CSR_matrix *BTA_alloc(const CSC_matrix *A, const CSC_matrix *B) int i, j, ii, jj; /* row ptr and column idxs for C = B^T A */ - int *Cp = (int *) SP_MALLOC((p + 1) * sizeof(int)); + int *Cp = (int *) sp_malloc((p + 1) * sizeof(int)); iVec *Ci = iVec_new(n); Cp[0] = 0; @@ -487,13 +487,13 @@ CSC_matrix *symBA_alloc(const CSR_matrix *B, const CSC_matrix *A) int i, j, k, jj, ii, ell; /* marker[row] = last column j that registered row as nonzero */ - int *marker = (int *) SP_MALLOC(m * sizeof(int)); + int *marker = (int *) sp_malloc(m * sizeof(int)); for (i = 0; i < m; i++) { marker[i] = -1; } - int *Cp = (int *) SP_MALLOC((n + 1) * sizeof(int)); + int *Cp = (int *) sp_malloc((n + 1) * sizeof(int)); iVec *Ci = iVec_new(A->nnz); Cp[0] = 0; diff --git a/src/utils/CSR_matrix.c b/src/utils/CSR_matrix.c index 1123068..132e58c 100644 --- a/src/utils/CSR_matrix.c +++ b/src/utils/CSR_matrix.c @@ -27,10 +27,10 @@ CSR_matrix *new_CSR_matrix(int m, int n, int nnz) { - CSR_matrix *matrix = (CSR_matrix *) SP_MALLOC(sizeof(CSR_matrix)); - matrix->p = (int *) SP_CALLOC(m + 1, sizeof(int)); - matrix->i = (int *) SP_CALLOC(nnz, sizeof(int)); - matrix->x = (double *) SP_MALLOC(nnz * sizeof(double)); + CSR_matrix *matrix = (CSR_matrix *) sp_malloc(sizeof(CSR_matrix)); + matrix->p = (int *) sp_calloc(m + 1, sizeof(int)); + matrix->i = (int *) sp_calloc(nnz, sizeof(int)); + matrix->x = (double *) sp_malloc(nnz * sizeof(double)); matrix->m = m; matrix->n = n; matrix->nnz = nnz; @@ -258,7 +258,7 @@ void symmetrize_csr(const int *Ap, const int *Ai, int m, CSR_matrix *C) int i, j, col; /* Count entries per row */ - int *counts = (int *) SP_CALLOC(m, sizeof(int)); + int *counts = (int *) sp_calloc(m, sizeof(int)); for (i = 0; i < m; i++) { for (j = Ap[i]; j < Ap[i + 1]; j++) diff --git a/src/utils/CSR_sum.c b/src/utils/CSR_sum.c index 286fd38..b30ef8d 100644 --- a/src/utils/CSR_sum.c +++ b/src/utils/CSR_sum.c @@ -373,7 +373,7 @@ CSR_matrix *sum_4_csr_alloc(const CSR_matrix *A, const CSR_matrix *B, CSR_matrix *out = new_CSR_matrix(m, n, nnz_ub); for (int k = 0; k < 4; k++) { - idx_maps[k] = (int *) SP_MALLOC(inputs[k]->nnz * sizeof(int)); + idx_maps[k] = (int *) sp_malloc(inputs[k]->nnz * sizeof(int)); } /* 4-way sorted merge per row */ diff --git a/src/utils/int_double_pair.c b/src/utils/int_double_pair.c index d03096a..8fe149d 100644 --- a/src/utils/int_double_pair.c +++ b/src/utils/int_double_pair.c @@ -31,7 +31,7 @@ static int compare_int_double_pair(const void *a, const void *b) int_double_pair *new_int_double_pair_array(int size) { - return (int_double_pair *) SP_MALLOC(size * sizeof(int_double_pair)); + return (int_double_pair *) sp_malloc(size * sizeof(int_double_pair)); } void set_int_double_pair_array(int_double_pair *pair, int *ints, double *doubles, diff --git a/src/utils/linalg_dense_sparse_matmuls.c b/src/utils/linalg_dense_sparse_matmuls.c index 22f133d..6a66841 100644 --- a/src/utils/linalg_dense_sparse_matmuls.c +++ b/src/utils/linalg_dense_sparse_matmuls.c @@ -15,11 +15,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "utils/linalg_dense_sparse_matmuls.h" #include "utils/CSC_matrix.h" #include "utils/CSR_matrix.h" #include "utils/cblas_wrapper.h" #include "utils/iVec.h" -#include "utils/linalg_dense_sparse_matmuls.h" #include "utils/tracked_alloc.h" #include #include @@ -35,7 +35,7 @@ CSC_matrix *I_kron_A_alloc(const matrix *A, const CSC_matrix *J, int p) int n = A->n; int i, j, jj, block, block_start, block_end, block_jj_start, row_offset; - int *Cp = (int *) SP_MALLOC((J->n + 1) * sizeof(int)); + int *Cp = (int *) sp_malloc((J->n + 1) * sizeof(int)); iVec *Ci = iVec_new(J->n * m); Cp[0] = 0; @@ -147,8 +147,8 @@ void I_kron_A_fill_values(const matrix *A, const CSC_matrix *J, CSC_matrix *C, j_dense[J->i[s] - block_start] = J->x[s]; } - cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A->x, n, - j_dense, 1, 0.0, C->x + i, 1); + cblas_dgemv(CblasRowMajor, CblasNoTrans, m, n, 1.0, A->x, n, j_dense, + 1, 0.0, C->x + i, 1); } } } @@ -174,7 +174,7 @@ CSR_matrix *YT_kron_I_alloc(int m, int k, int n, const CSC_matrix *J) // --------------------------------------------------------------- // build sparsity pattern per blk_row // --------------------------------------------------------------- - iVec **pattern = (iVec **) SP_MALLOC(m * sizeof(iVec *)); + iVec **pattern = (iVec **) sp_malloc(m * sizeof(iVec *)); total_nnz = 0; for (blk_row = 0; blk_row < m; blk_row++) { @@ -262,7 +262,7 @@ CSR_matrix *I_kron_X_alloc(int m, int k, int n, const CSC_matrix *J) * nonzero in row range [blk*k, blk*k + k). */ int i, j, ii, blk; - iVec **pattern = (iVec **) SP_MALLOC(n * sizeof(iVec *)); + iVec **pattern = (iVec **) sp_malloc(n * sizeof(iVec *)); int total_nnz = 0; for (blk = 0; blk < n; blk++) { diff --git a/src/utils/linalg_sparse_matmuls.c b/src/utils/linalg_sparse_matmuls.c index 7720f70..9ca83c5 100644 --- a/src/utils/linalg_sparse_matmuls.c +++ b/src/utils/linalg_sparse_matmuls.c @@ -96,7 +96,7 @@ CSC_matrix *block_left_multiply_fill_sparsity(const CSR_matrix *A, row_offset; /* allocate column pointers and an estimate of row indices */ - int *Cp = (int *) SP_MALLOC((J->n + 1) * sizeof(int)); + int *Cp = (int *) sp_malloc((J->n + 1) * sizeof(int)); iVec *Ci = iVec_new(J->n * m); Cp[0] = 0; @@ -250,8 +250,8 @@ void csr_csc_matmul_fill_values(const CSR_matrix *A, const CSC_matrix *B, } } -/* C = A @ B where A is CSR_matrix (m x n), B is CSC_matrix (n x p). Result C is CSR_matrix (m x p) - with precomputed sparsity pattern */ +/* C = A @ B where A is CSR_matrix (m x n), B is CSC_matrix (n x p). Result C is + CSR_matrix (m x p) with precomputed sparsity pattern */ CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B) { int m = A->m; @@ -259,7 +259,7 @@ CSR_matrix *csr_csc_matmul_alloc(const CSR_matrix *A, const CSC_matrix *B) int len_a, len_b; - int *Cp = (int *) SP_MALLOC((m + 1) * sizeof(int)); + int *Cp = (int *) sp_malloc((m + 1) * sizeof(int)); iVec *Ci = iVec_new(m); Cp[0] = 0; diff --git a/src/utils/matrix_sum.c b/src/utils/matrix_sum.c index c8b12b5..3bfc65a 100644 --- a/src/utils/matrix_sum.c +++ b/src/utils/matrix_sum.c @@ -33,6 +33,6 @@ void sum_matrices_fill_values(matrix *A, matrix *B, matrix *C) void sum_scaled_matrices_fill_values(matrix *A, matrix *B, matrix *C, const double *d1, const double *d2) { - sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C), - d1, d2); + sum_scaled_csr_matrices_fill_values(A->to_csr(A), B->to_csr(B), C->to_csr(C), d1, + d2); } diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index def3d19..eb058b9 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -111,7 +111,7 @@ static void permuted_dense_vtable_transpose_fill_values(const matrix *self, matrix *index_pd_alloc(const permuted_dense *A, const int *indices, int n_idxs) { /* Scan indices: which output positions i hit a row in A->row_perm? */ - int *new_row_perm = (int *) SP_MALLOC(n_idxs * sizeof(int)); + int *new_row_perm = (int *) sp_malloc(n_idxs * sizeof(int)); int new_m0 = 0; for (int i = 0; i < n_idxs; i++) { @@ -164,7 +164,7 @@ matrix *promote_pd_alloc(const permuted_dense *A, int size) NULL); } - int *new_row_perm = (int *) SP_MALLOC(size * sizeof(int)); + int *new_row_perm = (int *) sp_malloc(size * sizeof(int)); for (int i = 0; i < size; i++) { new_row_perm[i] = i; @@ -220,7 +220,7 @@ matrix *broadcast_pd_alloc(const permuted_dense *A, broadcast_type type, int d1, NULL); } - int *new_row_perm = (int *) SP_MALLOC(new_m0 * sizeof(int)); + int *new_row_perm = (int *) sp_malloc(new_m0 * sizeof(int)); int k = 0; if (type == BROADCAST_SCALAR) { @@ -319,7 +319,7 @@ matrix *diag_vec_pd_alloc(const permuted_dense *A) NULL); } - int *new_row_perm = (int *) SP_MALLOC(A->m0 * sizeof(int)); + int *new_row_perm = (int *) sp_malloc(A->m0 * sizeof(int)); for (int ii = 0; ii < A->m0; ii++) { new_row_perm[ii] = A->row_perm[ii] * (n + 1); @@ -444,7 +444,7 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, assert(col_perm[0] >= 0 && col_perm[n0 - 1] < n); } - permuted_dense *pd = (permuted_dense *) SP_CALLOC(1, sizeof(permuted_dense)); + permuted_dense *pd = (permuted_dense *) sp_calloc(1, sizeof(permuted_dense)); pd->base.m = m; pd->base.n = n; pd->base.nnz = m0 * n0; @@ -454,13 +454,13 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, pd->n0 = n0; int sz = m0 * n0; - pd->row_perm = (int *) SP_MALLOC(m0 * sizeof(int)); - pd->col_perm = (int *) SP_MALLOC(n0 * sizeof(int)); - pd->X = (double *) SP_MALLOC(sz * sizeof(double)); + pd->row_perm = (int *) sp_malloc(m0 * sizeof(int)); + pd->col_perm = (int *) sp_malloc(n0 * sizeof(int)); + pd->X = (double *) sp_malloc(sz * sizeof(double)); pd->base.x = pd->X; pd->owns_X = true; - pd->col_inv = (int *) SP_MALLOC(n * sizeof(int)); - pd->row_inv = (int *) SP_MALLOC(m * sizeof(int)); + pd->col_inv = (int *) sp_malloc(n * sizeof(int)); + pd->row_inv = (int *) sp_malloc(m * sizeof(int)); if (m0 > 0) { @@ -499,8 +499,8 @@ matrix *new_permuted_dense(int m, int n, int m0, int n0, const int *row_perm, matrix *new_permuted_dense_full(int m, int n, const double *data) { - int *row_perm = (int *) SP_MALLOC(m * sizeof(int)); - int *col_perm = (int *) SP_MALLOC(n * sizeof(int)); + int *row_perm = (int *) sp_malloc(m * sizeof(int)); + int *col_perm = (int *) sp_malloc(n * sizeof(int)); for (int i = 0; i < m; i++) row_perm[i] = i; for (int j = 0; j < n; j++) col_perm[j] = j; matrix *out = new_permuted_dense(m, n, m, n, row_perm, col_perm, data); @@ -548,6 +548,6 @@ void permuted_dense_ensure_kernel_dwork(const permuted_dense *pd_const, size_t s permuted_dense *pd = (permuted_dense *) pd_const; if (pd->kernel_dwork_size >= size) return; free(pd->kernel_dwork); - pd->kernel_dwork = (double *) SP_MALLOC(size * sizeof(double)); + pd->kernel_dwork = (double *) sp_malloc(size * sizeof(double)); pd->kernel_dwork_size = size; } diff --git a/src/utils/permuted_dense_linalg.c b/src/utils/permuted_dense_linalg.c index ca89f61..dc493e7 100644 --- a/src/utils/permuted_dense_linalg.c +++ b/src/utils/permuted_dense_linalg.c @@ -127,7 +127,7 @@ matrix *BTA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) in iwork, hence 2 * s_max). */ permuted_dense *C_pd = (permuted_dense *) C; C_pd->kernel_iwork_size = (size_t) 2 * s_max; - C_pd->kernel_iwork = (int *) SP_MALLOC(C_pd->kernel_iwork_size * sizeof(int)); + C_pd->kernel_iwork = (int *) sp_malloc(C_pd->kernel_iwork_size * sizeof(int)); return C; } @@ -328,7 +328,7 @@ matrix *BA_pd_pd_alloc(const permuted_dense *B, const permuted_dense *A) same idiom as BTA_pd_pd_alloc. */ permuted_dense *C_pd = (permuted_dense *) C; C_pd->kernel_iwork_size = (size_t) 2 * s_max; - C_pd->kernel_iwork = (int *) SP_MALLOC(C_pd->kernel_iwork_size * sizeof(int)); + C_pd->kernel_iwork = (int *) sp_malloc(C_pd->kernel_iwork_size * sizeof(int)); return C; } diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 211ac31..91b2467 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -65,7 +65,7 @@ matrix *new_sparse_matrix(CSR_matrix *A); void sparse_matrix_ensure_csc_cache(sparse_matrix *sm) { if (sm->csc_cache != NULL) return; - sm->csc_iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); + sm->csc_iwork = (int *) sp_malloc(sm->csr->n * sizeof(int)); sm->csc_cache = csr_to_csc_alloc(sm->csr, sm->csc_iwork); } @@ -105,7 +105,7 @@ static CSR_matrix *sparse_to_csr(matrix *self) static matrix *sparse_transpose_alloc(const matrix *self) { const sparse_matrix *sm = (const sparse_matrix *) self; - int *iwork = (int *) SP_MALLOC(sm->csr->n * sizeof(int)); + int *iwork = (int *) sp_malloc(sm->csr->n * sizeof(int)); CSR_matrix *AT = AT_alloc(sm->csr, iwork); sparse_matrix *out = (sparse_matrix *) new_sparse_matrix(AT); out->transpose_iwork = iwork; @@ -301,7 +301,8 @@ static void sparse_diag_vec_fill_values(matrix *self, matrix *out) } } -/* Build CSC_matrix structure on first call; refill values from csr->x on every call. */ +/* Build CSC_matrix structure on first call; refill values from csr->x on every call. + */ static void sparse_refresh_csc_values(matrix *self) { sparse_matrix *sm = (sparse_matrix *) self; @@ -335,7 +336,7 @@ static void wire_vtable(sparse_matrix *sm) matrix *new_sparse_matrix(CSR_matrix *A) { - sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix)); + sparse_matrix *sm = (sparse_matrix *) sp_calloc(1, sizeof(sparse_matrix)); sm->base.m = A->m; sm->base.n = A->n; sm->base.nnz = A->nnz; @@ -353,7 +354,7 @@ matrix *new_sparse_matrix_alloc(int m, int n, int nnz) matrix *sparse_matrix_trans(const sparse_matrix *self, int *iwork) { CSR_matrix *AT = transpose(self->csr, iwork); - sparse_matrix *sm = (sparse_matrix *) SP_CALLOC(1, sizeof(sparse_matrix)); + sparse_matrix *sm = (sparse_matrix *) sp_calloc(1, sizeof(sparse_matrix)); sm->base.m = AT->m; sm->base.n = AT->n; sm->base.nnz = AT->nnz; diff --git a/src/utils/stacked_pd.c b/src/utils/stacked_pd.c index aacaaf0..83d87ab 100644 --- a/src/utils/stacked_pd.c +++ b/src/utils/stacked_pd.c @@ -157,8 +157,8 @@ matrix *spd_map_filter_blocks(const stacked_pd *B, int Cm, int Cn, spd_block_op } permuted_dense **C_blocks = - (permuted_dense **) SP_MALLOC(B->n_blocks * sizeof(permuted_dense *)); - int *C_src = (int *) SP_MALLOC(B->n_blocks * sizeof(int)); + (permuted_dense **) sp_malloc(B->n_blocks * sizeof(permuted_dense *)); + int *C_src = (int *) sp_malloc(B->n_blocks * sizeof(int)); int out_nb = 0; for (int k = 0; k < B->n_blocks; k++) @@ -176,7 +176,7 @@ matrix *spd_map_filter_blocks(const stacked_pd *B, int Cm, int Cn, spd_block_op } } - int *C_src_p = (int *) SP_MALLOC((out_nb + 1) * sizeof(int)); + int *C_src_p = (int *) sp_malloc((out_nb + 1) * sizeof(int)); for (int k = 0; k <= out_nb; k++) { C_src_p[k] = k; @@ -198,7 +198,7 @@ void compose_csr_idx_map_for_spd(const stacked_pd *spd, const CSR_matrix *csr, col_perm straight into csr->i, the n0 CSR entries for any row are exactly that block's columns in order โ€” CSR position is just csr->p[row] + jj. */ - int *scratch = (int *) SP_MALLOC(spd->base.nnz * sizeof(int)); + int *scratch = (int *) sp_malloc(spd->base.nnz * sizeof(int)); int native = 0; for (int k = 0; k < spd->n_blocks; k++) { @@ -283,7 +283,7 @@ static matrix *stacked_pd_vtable_diag_vec_alloc(matrix *self) { stacked_pd *A = (stacked_pd *) self; permuted_dense **tmp_blocks = - (permuted_dense **) SP_MALLOC(A->n_blocks * sizeof(permuted_dense *)); + (permuted_dense **) sp_malloc(A->n_blocks * sizeof(permuted_dense *)); for (int k = 0; k < A->n_blocks; k++) { tmp_blocks[k] = (permuted_dense *) diag_vec_pd_alloc(A->blocks[k]); @@ -409,7 +409,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl // -------------------------------------------------------------------------------- // Set up basic fields // -------------------------------------------------------------------------------- - stacked_pd *spd = (stacked_pd *) SP_CALLOC(1, sizeof(stacked_pd)); + stacked_pd *spd = (stacked_pd *) sp_calloc(1, sizeof(stacked_pd)); spd->base.m = m; spd->base.n = n; int nnz = 0; @@ -421,8 +421,8 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl wire_vtable(spd); spd->n_blocks = n_blocks; - spd->blocks = (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *)); - spd->src_block_idx_p = (int *) SP_MALLOC((n_blocks + 1) * sizeof(int)); + spd->blocks = (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *)); + spd->src_block_idx_p = (int *) sp_malloc((n_blocks + 1) * sizeof(int)); spd->src_block_idx_p[0] = 0; if (n_blocks > 0) { @@ -435,7 +435,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl if (src_block_idx_p == NULL) { /* identity: each output block has itself as its one source */ - spd->src_block_idx = (int *) SP_MALLOC(n_blocks * sizeof(int)); + spd->src_block_idx = (int *) sp_malloc(n_blocks * sizeof(int)); for (int k = 0; k < n_blocks; k++) { spd->src_block_idx_p[k + 1] = k + 1; @@ -446,7 +446,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl { int total = src_block_idx_p[n_blocks]; memcpy(spd->src_block_idx_p, src_block_idx_p, (n_blocks + 1) * sizeof(int)); - spd->src_block_idx = (int *) SP_MALLOC(total * sizeof(int)); + spd->src_block_idx = (int *) sp_malloc(total * sizeof(int)); if (total > 0) { memcpy(spd->src_block_idx, src_block_idx, total * sizeof(int)); @@ -456,7 +456,7 @@ matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **bl // --------------------------------------------------------------------------- // Absorb each block's X into a single shared values buffer owned by this spd. // ---------------------------------------------------------------------------- - spd->base.x = (double *) SP_MALLOC(spd->base.nnz * sizeof(double)); + spd->base.x = (double *) sp_malloc(spd->base.nnz * sizeof(double)); int offset = 0; for (int k = 0; k < n_blocks; k++) { diff --git a/src/utils/stacked_pd_coalesce.c b/src/utils/stacked_pd_coalesce.c index f123a45..99c5d31 100644 --- a/src/utils/stacked_pd_coalesce.c +++ b/src/utils/stacked_pd_coalesce.c @@ -39,7 +39,7 @@ typedef struct static int_csr *int_csr_create(int *p, int *data, int n) { - int_csr *csr = (int_csr *) SP_MALLOC(sizeof(int_csr)); + int_csr *csr = (int_csr *) sp_malloc(sizeof(int_csr)); csr->p = p; csr->data = data; csr->n = n; @@ -118,7 +118,7 @@ static int_csr *build_row_to_blocks_csr(const stacked_pd *A, int *scratch) int m = A->base.m; int n_blocks = A->n_blocks; - int *p = (int *) SP_CALLOC(m + 1, sizeof(int)); + int *p = (int *) sp_calloc(m + 1, sizeof(int)); for (int k = 0; k < n_blocks; k++) { permuted_dense *Ak = A->blocks[k]; @@ -129,7 +129,7 @@ static int_csr *build_row_to_blocks_csr(const stacked_pd *A, int *scratch) } cumsum(p, m); - int *data = (int *) SP_MALLOC(p[m] * sizeof(int)); + int *data = (int *) sp_malloc(p[m] * sizeof(int)); int *blocks_added_per_row = scratch; for (int k = 0; k < n_blocks; k++) { @@ -148,7 +148,7 @@ static void catalog_signatures(const int_csr *row_to_blocks, iVec *catalog_sig_p iVec *catalog_sig_data, int **row_to_sig_out) { int m = row_to_blocks->n; - int *row_to_sig = (int *) SP_MALLOC(m * sizeof(int)); + int *row_to_sig = (int *) sp_malloc(m * sizeof(int)); for (int i = 0; i < m; i++) { int signature_len; @@ -169,7 +169,7 @@ static void catalog_signatures(const int_csr *row_to_blocks, iVec *catalog_sig_p static int_csr *group_rows_by_signature(int m, int n_sigs, const int *row_to_sig, int *scratch) { - int *p = (int *) SP_CALLOC(n_sigs + 1, sizeof(int)); + int *p = (int *) sp_calloc(n_sigs + 1, sizeof(int)); for (int i = 0; i < m; i++) { if (row_to_sig[i] >= 0) @@ -179,7 +179,7 @@ static int_csr *group_rows_by_signature(int m, int n_sigs, const int *row_to_sig } cumsum(p, n_sigs); - int *data = (int *) SP_MALLOC(p[n_sigs] * sizeof(int)); + int *data = (int *) sp_malloc(p[n_sigs] * sizeof(int)); int *rows_added_per_sig = scratch; for (int i = 0; i < m; i++) { @@ -208,13 +208,13 @@ static permuted_dense **build_output_blocks(const stacked_pd *A, int n = A->base.n; int n_sigs = catalog_sig_p->len - 1; - permuted_dense **out_blocks = (permuted_dense **) SP_MALLOC( + permuted_dense **out_blocks = (permuted_dense **) sp_malloc( (n_sigs > 0 ? n_sigs : 1) * sizeof(permuted_dense *)); iVec *col_union = iVec_new(8); /* col_arrs / col_lens: scratch passed to sorted_union_int_arrays. Max sig size is bounded by A->n_blocks, so size once outside the loop. */ - const int **col_arrs = (const int **) SP_MALLOC(A->n_blocks * sizeof(int *)); - int *col_lens = (int *) SP_MALLOC(A->n_blocks * sizeof(int)); + const int **col_arrs = (const int **) sp_malloc(A->n_blocks * sizeof(int *)); + int *col_lens = (int *) sp_malloc(A->n_blocks * sizeof(int)); for (int s = 0; s < n_sigs; s++) { @@ -247,7 +247,7 @@ static matrix *coalesce_spd_alloc_impl(const stacked_pd *A, { int m = A->base.m; int n = A->base.n; - int *iwork = (int *) SP_CALLOC(m, sizeof(int)); + int *iwork = (int *) sp_calloc(m, sizeof(int)); // --------------------------------------------------------------------------- // For each global row in A, catalog which blocks contain it. diff --git a/src/utils/stacked_pd_kron_linalg.c b/src/utils/stacked_pd_kron_linalg.c index cd8b2de..679b7ce 100644 --- a/src/utils/stacked_pd_kron_linalg.c +++ b/src/utils/stacked_pd_kron_linalg.c @@ -56,8 +56,8 @@ static permuted_dense *kron_scratch_init(const permuted_dense *A, int p) int n = A->n0; /* Initial block-0 perms. */ - int *row_perm = (int *) SP_MALLOC(m * sizeof(int)); - int *col_perm = (int *) SP_MALLOC(n * sizeof(int)); + int *row_perm = (int *) sp_malloc(m * sizeof(int)); + int *col_perm = (int *) sp_malloc(n * sizeof(int)); for (int i = 0; i < m; i++) row_perm[i] = i; for (int j = 0; j < n; j++) col_perm[j] = j; @@ -82,8 +82,8 @@ static permuted_dense *kron_BT_alloc(const permuted_dense *A, int p) { int n = A->n0; int m = A->m0; - int *row_perm = (int *) SP_MALLOC(n * sizeof(int)); - int *col_perm = (int *) SP_MALLOC(m * sizeof(int)); + int *row_perm = (int *) sp_malloc(n * sizeof(int)); + int *col_perm = (int *) sp_malloc(m * sizeof(int)); for (int j = 0; j < n; j++) row_perm[j] = j; for (int i = 0; i < m; i++) col_perm[i] = i; matrix *BT_m = new_permuted_dense(n * p, m * p, n, m, row_perm, col_perm, NULL); @@ -136,8 +136,8 @@ static matrix *kron_alloc_blockwise(permuted_dense *state, int p, int Cm, int Cn } permuted_dense **C_blocks = - (permuted_dense **) SP_MALLOC(p * sizeof(permuted_dense *)); - int *C_src = (int *) SP_MALLOC(p * sizeof(int)); + (permuted_dense **) sp_malloc(p * sizeof(permuted_dense *)); + int *C_src = (int *) sp_malloc(p * sizeof(int)); int out_nb = 0; int last_k = 0; @@ -159,7 +159,7 @@ static matrix *kron_alloc_blockwise(permuted_dense *state, int p, int Cm, int Cn } } - int *C_src_p = (int *) SP_MALLOC((out_nb + 1) * sizeof(int)); + int *C_src_p = (int *) sp_malloc((out_nb + 1) * sizeof(int)); for (int k = 0; k <= out_nb; k++) C_src_p[k] = k; matrix *C = new_stacked_pd(Cm, Cn, out_nb, C_blocks, C_src_p, C_src); diff --git a/src/utils/stacked_pd_linalg.c b/src/utils/stacked_pd_linalg.c index 6d1f9ae..8df8bc7 100644 --- a/src/utils/stacked_pd_linalg.c +++ b/src/utils/stacked_pd_linalg.c @@ -35,7 +35,7 @@ matrix *copy_sparsity_spd_alloc(const stacked_pd *A) { int n_blocks = A->n_blocks; permuted_dense **C_blocks = - (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *)); + (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *)); for (int k = 0; k < n_blocks; k++) { C_blocks[k] = (permuted_dense *) copy_sparsity_pd_alloc(A->blocks[k]); @@ -67,7 +67,7 @@ matrix *transpose_spd_alloc(const stacked_pd *A) { int n_blocks = A->n_blocks; permuted_dense **AT_blocks = - (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *)); + (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *)); for (int k = 0; k < n_blocks; k++) { AT_blocks[k] = (permuted_dense *) transpose_pd_alloc(A->blocks[k]); @@ -126,7 +126,7 @@ static matrix *spd_blockwise_alloc_coalesce(const stacked_pd *spd_iter, int Cm, { int n_blocks = spd_iter->n_blocks; permuted_dense **partials = - (permuted_dense **) SP_MALLOC(n_blocks * sizeof(permuted_dense *)); + (permuted_dense **) sp_malloc(n_blocks * sizeof(permuted_dense *)); for (int k = 0; k < n_blocks; k++) { partials[k] = (permuted_dense *) op(spd_iter->blocks[k], ctx); @@ -217,8 +217,8 @@ matrix *BTA_pd_spd_alloc(const permuted_dense *B, const stacked_pd *A) // Find column permutations of C. An A-block contributes if its row_perm overlaps // B's row_perm. // ------------------------------------------------------------------------------ - const int **col_perms = (const int **) SP_MALLOC(n_blocks * sizeof(int *)); - int *lens = (int *) SP_MALLOC(n_blocks * sizeof(int)); + const int **col_perms = (const int **) sp_malloc(n_blocks * sizeof(int *)); + int *lens = (int *) sp_malloc(n_blocks * sizeof(int)); int n_contributing_A_blocks = 0; for (int k = 0; k < n_blocks; k++) @@ -266,7 +266,7 @@ matrix *BTA_pd_spd_alloc(const permuted_dense *B, const stacked_pd *A) } C_pd->kernel_iwork_size = (size_t) 2 + (size_t) 2 * s_max + (size_t) max_n0_A; - C_pd->kernel_iwork = (int *) SP_MALLOC(C_pd->kernel_iwork_size * sizeof(int)); + C_pd->kernel_iwork = (int *) sp_malloc(C_pd->kernel_iwork_size * sizeof(int)); C_pd->kernel_iwork[0] = s_max; C_pd->kernel_iwork[1] = max_n0_A; diff --git a/src/utils/tracked_alloc.c b/src/utils/tracked_alloc.c index cb834be..9b0f4d8 100644 --- a/src/utils/tracked_alloc.c +++ b/src/utils/tracked_alloc.c @@ -18,3 +18,4 @@ #include "utils/tracked_alloc.h" size_t g_allocated_bytes = 0; +size_t g_peak_bytes = 0; diff --git a/src/utils/utils.c b/src/utils/utils.c index 213b4e2..6eb105d 100644 --- a/src/utils/utils.c +++ b/src/utils/utils.c @@ -86,7 +86,7 @@ void sorted_union_int_arrays(const int *const *arrs, const int *lens, int n_arrs iVec *out) { iVec_clear_no_resize(out); - int *cursor = (int *) SP_CALLOC(n_arrs, sizeof(int)); + int *cursor = (int *) sp_calloc(n_arrs, sizeof(int)); while (1) { int min_val = 0; diff --git a/tests/all_tests.c b/tests/all_tests.c index 9e7ad45..77a9d12 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -110,6 +110,7 @@ #include "profiling/profile_hessian_exp_AX.h" #include "profiling/profile_left_matmul.h" #include "profiling/profile_log_reg.h" +#include "profiling/profile_memory.h" #include "profiling/profile_trimmed_log_reg.h" #endif /* PROFILE_ONLY */ @@ -536,6 +537,7 @@ int main(void) mu_run_test(profile_trimmed_log_reg, tests_run); mu_run_test(profile_BTA_pd_csr_vs_csc, tests_run); mu_run_test(profile_hessian_exp_AX, tests_run); + mu_run_test(profile_memory, tests_run); #endif /* PROFILE_ONLY */ printf("\n=== All %d tests passed ===\n", tests_run); diff --git a/tests/problem/test_param_broadcast.h b/tests/problem/test_param_broadcast.h index 377121a..c64d64b 100644 --- a/tests/problem/test_param_broadcast.h +++ b/tests/problem/test_param_broadcast.h @@ -192,8 +192,7 @@ const char *test_const_sum_scalar_mult(void) problem_init_derivatives(prob); - /* point for evaluating */ - double x_vals[1] = {4.0}; + double x_vals[6] = {4.0, 0.0, 0.0, 0.0, 0.0, 0.0}; problem_constraint_forward(prob, x_vals); double constrs[1] = {6.0 * 4.0}; @@ -226,8 +225,7 @@ const char *test_param_sum_scalar_mult(void) problem_register_params(prob, param_nodes, 1); problem_init_derivatives(prob); - /* point for evaluating */ - double x_vals[1] = {4.0}; + double x_vals[6] = {4.0, 0.0, 0.0, 0.0, 0.0, 0.0}; problem_constraint_forward(prob, x_vals); double constrs[1] = {6.0 * 4.0}; diff --git a/tests/profiling/profile_hessian_exp_AX.h b/tests/profiling/profile_hessian_exp_AX.h index eb75020..a5cdafc 100644 --- a/tests/profiling/profile_hessian_exp_AX.h +++ b/tests/profiling/profile_hessian_exp_AX.h @@ -42,9 +42,9 @@ const char *profile_hessian_exp_AX(void) /* Random A (row-major), X (col-major vec), w. Values in [-0.5, 0.5] to keep exp() bounded and finite. */ - double *A_data = (double *) SP_MALLOC(n_vars * sizeof(double)); - double *X_vals = (double *) SP_MALLOC(n_vars * sizeof(double)); - double *w = (double *) SP_MALLOC(n_vars * sizeof(double)); + double *A_data = (double *) sp_malloc(n_vars * sizeof(double)); + double *X_vals = (double *) sp_malloc(n_vars * sizeof(double)); + double *w = (double *) sp_malloc(n_vars * sizeof(double)); for (int k = 0; k < n_vars; k++) { A_data[k] = ((double) rand() / (double) RAND_MAX) - 0.5; @@ -86,7 +86,7 @@ const char *profile_hessian_exp_AX(void) /* AX_cm[j*n + i] = sum_k A_data[i*n + k] * X_vals[j*n + k] */ /* = A[i, :] ยท X[:, j]. */ /* ------------------------------------------------------------ */ - double *AX_cm = (double *) SP_MALLOC(n_vars * sizeof(double)); + double *AX_cm = (double *) sp_malloc(n_vars * sizeof(double)); for (int j = 0; j < n; j++) { for (int i = 0; i < n; i++) @@ -99,7 +99,7 @@ const char *profile_hessian_exp_AX(void) AX_cm[j * n + i] = s; } } - double *d = (double *) SP_MALLOC(n_vars * sizeof(double)); + double *d = (double *) sp_malloc(n_vars * sizeof(double)); for (int k = 0; k < n_vars; k++) { d[k] = w[k] * exp(AX_cm[k]); @@ -114,8 +114,8 @@ const char *profile_hessian_exp_AX(void) Timer t2a; clock_gettime(CLOCK_MONOTONIC, &t2a.start); permuted_dense **j_blocks = - (permuted_dense **) SP_MALLOC(n * sizeof(permuted_dense *)); - int *row_perm = (int *) SP_MALLOC(n * sizeof(int)); + (permuted_dense **) sp_malloc(n * sizeof(permuted_dense *)); + int *row_perm = (int *) sp_malloc(n * sizeof(int)); for (int k = 0; k < n; k++) { for (int i = 0; i < n; i++) @@ -140,8 +140,8 @@ const char *profile_hessian_exp_AX(void) /* Densify both Hessians to a 2500 x 2500 row-major buffer and */ /* compute max-abs diff. */ /* ------------------------------------------------------------ */ - double *D1 = (double *) SP_CALLOC(N, sizeof(double)); - double *D2 = (double *) SP_CALLOC(N, sizeof(double)); + double *D1 = (double *) sp_calloc(N, sizeof(double)); + double *D2 = (double *) sp_calloc(N, sizeof(double)); CSR_matrix *csr = H1->to_csr(H1); for (int i = 0; i < csr->m; i++) diff --git a/tests/profiling/profile_memory.h b/tests/profiling/profile_memory.h new file mode 100644 index 0000000..fd9946b --- /dev/null +++ b/tests/profiling/profile_memory.h @@ -0,0 +1,59 @@ +#ifndef PROFILE_MEMORY_H +#define PROFILE_MEMORY_H + +#include + +#include "atoms/affine.h" +#include "atoms/bivariate_full_dom.h" +#include "atoms/elementwise_full_dom.h" +#include "expr.h" +#include "problem.h" + +/* Reproduces the Python test + n = 100, rank = 10 + A in R^{n x n}, B, C in R^{n x rank} + cost = sum(exp(A @ (B @ C.T))) + and lets free_problem print the SparseDiff banner so we can compare + the C "Peak memory" number against the Python report. Bounds on B and C + from the Python snippet are solver-level, so they are omitted here + (matching the "Number of constraints: 0" line in the Python output). */ +const char *profile_memory(void) +{ + const int n = 100; + const int rank = 10; + const int n_vars = 2 * n * rank; + srand(0); + + double *A_data = (double *) malloc((size_t) n * n * sizeof(double)); + double *u = (double *) malloc((size_t) n_vars * sizeof(double)); + for (int k = 0; k < n * n; k++) + { + A_data[k] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + for (int k = 0; k < n_vars; k++) + { + u[k] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + + expr *B = new_variable(n, rank, 0, n_vars); + expr *C = new_variable(n, rank, n * rank, n_vars); + expr *CT = new_transpose(C); + expr *BCT = new_matmul(B, CT); + expr *AX = new_left_matmul_dense(NULL, BCT, n, n, A_data); + expr *e = new_exp(AX); + expr *cost = new_sum(e, -1); + + problem *prob = new_problem(cost, NULL, 0, /*verbose=*/true); + + problem_init_derivatives(prob); + problem_objective_forward(prob, u); + problem_gradient(prob); + problem_hessian(prob, 1.0, NULL); + + free_problem(prob); + free(A_data); + free(u); + return 0; +} + +#endif /* PROFILE_MEMORY_H */ diff --git a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h index d8b943b..296dd3c 100644 --- a/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h +++ b/tests/wsum_hess/composite/test_chain_rule_wsum_hess.h @@ -300,13 +300,13 @@ const char *test_wsum_hess_AX_BX_multiply(void) const char *test_wsum_hess_multiply_deep_composite(void) { - double u_vals[4] = {1.0, 2.0, 3.0, 4.0}; + double u_vals[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; double w[4] = {1.1, 2.2, 3.3, 4.4}; CSR_matrix *A = new_csr_random(2, 2, 1.0); CSR_matrix *B = new_csr_random(2, 2, 1.0); expr *X = new_variable(2, 2, 0, 8); - expr *Y = new_variable(2, 2, 0, 8); + expr *Y = new_variable(2, 2, 4, 8); expr *AX = new_left_matmul(NULL, X, A); expr *BY = new_left_matmul(NULL, Y, B); expr *sin_AX = new_sin(AX);