diff --git a/include/utils/matmul_dispatchers.h b/include/utils/matmul_dispatchers.h index 4f4ea28..8fd326b 100644 --- a/include/utils/matmul_dispatchers.h +++ b/include/utils/matmul_dispatchers.h @@ -29,8 +29,9 @@ Contract: for sparse_matrix operands (A or B), the caller is responsible for refreshing csc_cache values via refresh_csc_values before - BTDA_matrices_fill_values. */ + BTA_matrices_fill_values / BTDA_matrices_fill_values. */ matrix *BTA_matrices_alloc(matrix *A, matrix *B); +void BTA_matrices_fill_values(matrix *A, matrix *B, matrix *C); void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C); /* Polymorphic dispatchers for C = B @ A, where B is permuted_dense and A is any diff --git a/include/utils/permuted_dense_linalg.h b/include/utils/permuted_dense_linalg.h index b049a73..0d8de83 100644 --- a/include/utils/permuted_dense_linalg.h +++ b/include/utils/permuted_dense_linalg.h @@ -64,6 +64,10 @@ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A, /* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A); +/* Fill values of C = B^T @ A where B is PD and A is CSC. */ +void BTA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A, + permuted_dense *C); + /* Fill values of C = B^T @ diag(d) @ A where B is PD and A is CSC */ void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, const CSC_matrix *A, permuted_dense *C); @@ -71,6 +75,10 @@ void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, /* Allocate new permuted_dense for C = B^T @ A where B is Sparse CSC and A is PD. */ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A); +/* Fill values of C = B^T @ A where B is CSC and A is PD. */ +void BTA_csc_pd_fill_values(const CSC_matrix *B, const permuted_dense *A, + permuted_dense *C); + /* Fill values of C = B^T @ diag(d) @ A where B is CSC and A is PD */ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, const permuted_dense *A, permuted_dense *C); diff --git a/include/utils/stacked_pd_linalg.h b/include/utils/stacked_pd_linalg.h index 8083fdf..2d50b27 100644 --- a/include/utils/stacked_pd_linalg.h +++ b/include/utils/stacked_pd_linalg.h @@ -61,6 +61,11 @@ void BTDA_pd_spd_fill_values(const permuted_dense *B, const double *d, permuted_dense. The output C is stacked_pd. */ matrix *BTA_spd_pd_alloc(const stacked_pd *B, const permuted_dense *A); +/* Fill values of C = B^T @ A where B is stacked_pd, A is permuted_dense, and + C is stacked_pd. */ +void BTA_spd_pd_fill_values(const stacked_pd *B, const permuted_dense *A, + stacked_pd *C); + /* Fill values of C = B^T @ diag(d) @ A where B is stacked_pd, A is permuted_dense, d is a global vector of length B->m, and C is stacked_pd. */ void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d, @@ -70,6 +75,11 @@ void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d, The output C is stacked_pd. */ matrix *BTA_spd_csc_alloc(const stacked_pd *B, const CSC_matrix *A); +/* Fill values of C = B^T @ A where B is stacked_pd and A is CSC, and C is + stacked_pd. C must be pre-allocated via BTA_spd_csc_alloc. */ +void BTA_spd_csc_fill_values(const stacked_pd *B, const CSC_matrix *A, + stacked_pd *C); + /* Fill values of C = B^T @ diag(d) @ A where B is stacked_pd, A is CSC, d is a global vector of length B->m, and C is stacked_pd. */ void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d, @@ -79,6 +89,11 @@ void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d, The output C is stacked_pd. */ matrix *BTA_spd_spd_alloc(const stacked_pd *B, const stacked_pd *A); +/* Fill values of C = B^T @ A where both B and A are stacked_pd, and C is + stacked_pd. C must be pre-allocated via BTA_spd_spd_alloc. */ +void BTA_spd_spd_fill_values(const stacked_pd *B, const stacked_pd *A, + stacked_pd *C); + /* Fill values of C = B^T @ diag(d) @ A where both B and A are stacked_pd, d is a global vector of length B->m, and C is stacked_pd. C must be pre-allocated via BTA_spd_spd_alloc. */ @@ -89,6 +104,11 @@ void BTDA_spd_spd_fill_values(const stacked_pd *B, const double *d, The output C is stacked_pd. */ matrix *BTA_csc_spd_alloc(const CSC_matrix *B, const stacked_pd *A); +/* Fill values of C = B^T @ A where B is CSC and A is stacked_pd, and C is + stacked_pd. C must be pre-allocated via BTA_csc_spd_alloc. */ +void BTA_csc_spd_fill_values(const CSC_matrix *B, const stacked_pd *A, + stacked_pd *C); + /* Fill values of C = B^T @ diag(d) @ A where B is CSC, A is stacked_pd, d is a global vector of length A->m, and C is stacked_pd. C must be pre-allocated via BTA_csc_spd_alloc. */ diff --git a/src/utils/matmul_dispatchers.c b/src/utils/matmul_dispatchers.c index 72967d6..6ba31bd 100644 --- a/src/utils/matmul_dispatchers.c +++ b/src/utils/matmul_dispatchers.c @@ -22,14 +22,20 @@ #include /* Forward declarations of the fixed-B dispatchers used internally by - BTA_matrices_alloc / BTDA_matrices_fill_values. */ + BTA_matrices_alloc / BTA_matrices_fill_values / BTDA_matrices_fill_values. */ static matrix *BTA_pd_matrices_alloc(const permuted_dense *B, matrix *A); +static void BTA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C); static void BTDA_pd_matrices_fill_values(const permuted_dense *B, const double *d, const matrix *A, permuted_dense *C); static matrix *BTA_spd_matrices_alloc(const stacked_pd *B, matrix *A); +static void BTA_spd_matrices_fill_values(const stacked_pd *B, const matrix *A, + stacked_pd *C); static void BTDA_spd_matrices_fill_values(const stacked_pd *B, const double *d, const matrix *A, stacked_pd *C); static matrix *BTA_sparse_matrices_alloc(const sparse_matrix *B, matrix *A); +static void BTA_sparse_matrices_fill_values(const sparse_matrix *B, const matrix *A, + matrix *C); static void BTDA_sparse_matrices_fill_values(const sparse_matrix *B, const double *d, const matrix *A, matrix *C); @@ -65,6 +71,23 @@ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C) BTDA_sparse_matrices_fill_values((const sparse_matrix *) B, d, A, C); } +void BTA_matrices_fill_values(matrix *A, matrix *B, matrix *C) +{ + if (B->is_permuted_dense) + { + BTA_pd_matrices_fill_values((const permuted_dense *) B, A, + (permuted_dense *) C); + return; + } + if (B->is_stacked_pd) + { + BTA_spd_matrices_fill_values((const stacked_pd *) B, A, (stacked_pd *) C); + return; + } + /* B is sparse_matrix */ + BTA_sparse_matrices_fill_values((const sparse_matrix *) B, A, C); +} + matrix *BA_pd_matrices_alloc(const permuted_dense *B, matrix *A) { if (A->is_permuted_dense) @@ -137,6 +160,25 @@ static void BTDA_pd_matrices_fill_values(const permuted_dense *B, const double * BTDA_pd_csc_fill_values(B, d, sm_A->csc_cache, C); } +static void BTA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A, + permuted_dense *C) +{ + if (A->is_permuted_dense) + { + BTA_pd_pd_fill_values(B, (const permuted_dense *) A, C); + return; + } + if (A->is_stacked_pd) + { + BTA_pd_spd_fill_values(B, (const stacked_pd *) A, C); + return; + } + + /* A is sparse */ + const sparse_matrix *sm_A = (const sparse_matrix *) A; + BTA_pd_csc_fill_values(B, sm_A->csc_cache, C); +} + static matrix *BTA_spd_matrices_alloc(const stacked_pd *B, matrix *A) { if (A->is_permuted_dense) @@ -173,6 +215,25 @@ static void BTDA_spd_matrices_fill_values(const stacked_pd *B, const double *d, BTDA_spd_csc_fill_values(B, d, sm_A->csc_cache, C); } +static void BTA_spd_matrices_fill_values(const stacked_pd *B, const matrix *A, + stacked_pd *C) +{ + if (A->is_permuted_dense) + { + BTA_spd_pd_fill_values(B, (const permuted_dense *) A, C); + return; + } + if (A->is_stacked_pd) + { + BTA_spd_spd_fill_values(B, (const stacked_pd *) A, C); + return; + } + + /* A is sparse */ + const sparse_matrix *sm_A = (const sparse_matrix *) A; + BTA_spd_csc_fill_values(B, sm_A->csc_cache, C); +} + static matrix *BTA_sparse_matrices_alloc(const sparse_matrix *B, matrix *A) { /* Ensure B's csc_cache structure exists. */ @@ -216,6 +277,28 @@ static void BTDA_sparse_matrices_fill_values(const sparse_matrix *B, const doubl BTDA_fill_values(sm_A->csc_cache, B->csc_cache, d, ((sparse_matrix *) C)->csr); } +static void BTA_sparse_matrices_fill_values(const sparse_matrix *B, const matrix *A, + matrix *C) +{ + if (A->is_permuted_dense) + { + BTA_csc_pd_fill_values(B->csc_cache, (const permuted_dense *) A, + (permuted_dense *) C); + return; + } + if (A->is_stacked_pd) + { + BTA_csc_spd_fill_values(B->csc_cache, (const stacked_pd *) A, + (stacked_pd *) C); + return; + } + + /* A is sparse */ + const sparse_matrix *sm_A = (const sparse_matrix *) A; + BTDA_fill_values(sm_A->csc_cache, B->csc_cache, NULL, + ((sparse_matrix *) C)->csr); +} + /* Debug-only check that A is the "full" permuted_dense shape the kron helpers require: m0 == base.m, n0 == base.n, identity inner perms. Called once at alloc; not repeated at fill (caller would have to diff --git a/src/utils/permuted_dense_linalg.c b/src/utils/permuted_dense_linalg.c index dc493e7..f6a9cd2 100644 --- a/src/utils/permuted_dense_linalg.c +++ b/src/utils/permuted_dense_linalg.c @@ -415,6 +415,21 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A) return C; } +/* C = B^T @ A = (B^T) A, with B^T materialized into B->kernel_dwork. The no- + diagonal analogue of BTDA_pd_csc_fill_values. */ +void BTA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A, + permuted_dense *C) +{ + /* C may be empty */ + if (C->base.nnz == 0) + { + return; + } + + A_transpose(B->kernel_dwork, B->X, B->m0, B->n0); + BA_pd_csc_fill_values(B->kernel_dwork, B->m0, B->row_inv, A, C); +} + /* C = B^T diag(d) A = (diag (d) B)^T A */ void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d, const CSC_matrix *A, permuted_dense *C) @@ -469,12 +484,12 @@ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A) return C; } -/* Internal helper for BTDA_csc_pd_fill_values: C = B^T @ A where B is CSC - and the right operand A is supplied as a transposed-layout raw buffer - (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense +/* Internal core for BTA_csc_pd_fill_values / BTDA_csc_pd_fill_values: C = B^T @ A + where B is CSC and the right operand A is supplied as a transposed-layout raw + buffer (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense block). Transposed-output sibling of BA_pd_csc_fill_values. */ -static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m0_A, - const int *inv, permuted_dense *C) +static void BTA_csc_denseT_fill_values(const CSC_matrix *B, const double *A_T, + int m0_A, const int *inv, permuted_dense *C) { /* C[i_C, j_C] = dot(col C->row_perm[i_C] of B, row j_C of A_T). */ for (int i_C = 0; i_C < C->m0; i_C++) @@ -491,9 +506,25 @@ static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m } } +/* C = B^T @ A where B is CSC and A is permuted_dense. The csc_pd analogue of + BTDA_csc_pd_fill_values without the diagonal: transpose A's dense block into + A->kernel_dwork (the column-contiguous layout the core wants) and delegate. */ +void BTA_csc_pd_fill_values(const CSC_matrix *B, const permuted_dense *A, + permuted_dense *C) +{ + if (C->base.nnz == 0) + { + return; + } + + /* A->kernel_dwork = X_A^T, row-major shape (n0_A, m0_A). */ + A_transpose(A->kernel_dwork, A->X, A->m0, A->n0); + BTA_csc_denseT_fill_values(B, A->kernel_dwork, A->m0, A->row_inv, C); +} + /* C = B^T diag(d) A. Folds diag(d) into A's dense block (writing (diag(d_perm) X_A)^T into A->kernel_dwork) and delegates to - BTA_csc_pd_fill_values. Mirrors how BTDA_pd_csc_fill_values wraps + BTA_csc_denseT_fill_values. Mirrors how BTDA_pd_csc_fill_values wraps BA_pd_csc_fill_values. */ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, const permuted_dense *A, permuted_dense *C) @@ -509,7 +540,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, /* A->kernel_dwork = (diag(d_perm) X_A)^T, row-major shape (n0_A, m0_A). Pre-sized by BTA_csc_pd_alloc; no allocation in fill. Column j of (diag(d) X_A) lives contiguously in dwork as row j — - which is exactly the layout BTA_csc_pd_fill_values wants. */ + which is exactly the layout BTA_csc_denseT_fill_values wants. */ for (int kk = 0; kk < m0_A; kk++) { double dk = d[A->row_perm[kk]]; @@ -519,7 +550,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, } } - BTA_csc_pd_fill_values(B, A->kernel_dwork, m0_A, A->row_inv, C); + BTA_csc_denseT_fill_values(B, A->kernel_dwork, m0_A, A->row_inv, C); } /* Original transpose-via-Cprime implementation of BTDA_csc_pd_fill_values. @@ -528,10 +559,10 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d, #if defined(__GNUC__) || defined(__clang__) __attribute__((unused)) #endif -static void BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B, - const double *d, - const permuted_dense *A, - permuted_dense *C) +static void +BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B, const double *d, + const permuted_dense *A, + permuted_dense *C) { if (C->base.nnz == 0) { diff --git a/src/utils/stacked_pd_linalg.c b/src/utils/stacked_pd_linalg.c index 90a098f..dc46743 100644 --- a/src/utils/stacked_pd_linalg.c +++ b/src/utils/stacked_pd_linalg.c @@ -406,12 +406,29 @@ static void wrapper_BTDA_pd_pd(const permuted_dense *Bk, const double *d, BTDA_pd_pd_fill_values(Bk, d, (const permuted_dense *) ctx, Ck); } +/* BTA per-block fill: plain B_k^T @ A, no diagonal. The shared skeleton + threads d through verbatim and never dereferences it, so the caller passes + d = NULL and we ignore it here. Calling BTA_pd_pd_fill_values directly (vs. + the BTDA wrapper) also sidesteps the per-block DA temp. */ +static void wrapper_BTA_pd_pd_fill(const permuted_dense *Bk, const double *d, + const void *ctx, permuted_dense *Ck) +{ + (void) d; + BTA_pd_pd_fill_values(Bk, (const permuted_dense *) ctx, Ck); +} + matrix *BTA_spd_pd_alloc(const stacked_pd *B, const permuted_dense *A) { return spd_blockwise_alloc_coalesce(B, B->base.n, A->base.n, wrapper_BTA_pd_pd, A); } +void BTA_spd_pd_fill_values(const stacked_pd *B, const permuted_dense *A, + stacked_pd *C) +{ + spd_blockwise_fill_coalesce_accumulate(B, NULL, A, C, wrapper_BTA_pd_pd_fill); +} + void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d, const permuted_dense *A, stacked_pd *C) { @@ -436,11 +453,26 @@ static void wrapper_BTDA_pd_csc(const permuted_dense *Bk, const double *d, BTDA_pd_csc_fill_values(Bk, d, (const CSC_matrix *) ctx, Ck); } +/* BTA per-block fill: plain B_k^T @ A, no diagonal. The skeleton forwards d + verbatim and never dereferences it, so the caller passes d = NULL and we + ignore it. */ +static void wrapper_BTA_pd_csc_fill(const permuted_dense *Bk, const double *d, + const void *ctx, permuted_dense *Ck) +{ + (void) d; + BTA_pd_csc_fill_values(Bk, (const CSC_matrix *) ctx, Ck); +} + matrix *BTA_spd_csc_alloc(const stacked_pd *B, const CSC_matrix *A) { return spd_blockwise_alloc_coalesce(B, B->base.n, A->n, wrapper_BTA_pd_csc, A); } +void BTA_spd_csc_fill_values(const stacked_pd *B, const CSC_matrix *A, stacked_pd *C) +{ + spd_blockwise_fill_coalesce_accumulate(B, NULL, A, C, wrapper_BTA_pd_csc_fill); +} + void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d, const CSC_matrix *A, stacked_pd *C) { @@ -465,12 +497,28 @@ static void wrapper_BTDA_pd_spd(const permuted_dense *Bk, const double *d, BTDA_pd_spd_fill_values(Bk, d, (const stacked_pd *) ctx, Ck); } +/* BTA per-block fill: plain B_k^T @ A_spd, no diagonal. The skeleton forwards d + verbatim and never dereferences it, so the caller passes d = NULL and we + ignore it. Calling BTA_pd_spd_fill_values directly (vs. the BTDA wrapper) also + avoids the per-block DA temp. */ +static void wrapper_BTA_pd_spd_fill(const permuted_dense *Bk, const double *d, + const void *ctx, permuted_dense *Ck) +{ + (void) d; + BTA_pd_spd_fill_values(Bk, (const stacked_pd *) ctx, Ck); +} + matrix *BTA_spd_spd_alloc(const stacked_pd *B, const stacked_pd *A) { return spd_blockwise_alloc_coalesce(B, B->base.n, A->base.n, wrapper_BTA_pd_spd, A); } +void BTA_spd_spd_fill_values(const stacked_pd *B, const stacked_pd *A, stacked_pd *C) +{ + spd_blockwise_fill_coalesce_accumulate(B, NULL, A, C, wrapper_BTA_pd_spd_fill); +} + void BTDA_spd_spd_fill_values(const stacked_pd *B, const double *d, const stacked_pd *A, stacked_pd *C) { @@ -500,11 +548,26 @@ static void wrapper_BTDA_csc_pd(const permuted_dense *Ak, const double *d, BTDA_csc_pd_fill_values((const CSC_matrix *) ctx, d, Ak, Ck); } +/* BTA per-block fill: plain B^T @ A_k, no diagonal. The skeleton forwards d + verbatim and never dereferences it, so the caller passes d = NULL and we + ignore it. */ +static void wrapper_BTA_csc_pd_fill(const permuted_dense *Ak, const double *d, + const void *ctx, permuted_dense *Ck) +{ + (void) d; + BTA_csc_pd_fill_values((const CSC_matrix *) ctx, Ak, Ck); +} + matrix *BTA_csc_spd_alloc(const CSC_matrix *B, const stacked_pd *A) { return spd_blockwise_alloc_coalesce(A, B->n, A->base.n, wrapper_BTA_csc_pd, B); } +void BTA_csc_spd_fill_values(const CSC_matrix *B, const stacked_pd *A, stacked_pd *C) +{ + spd_blockwise_fill_coalesce_accumulate(A, NULL, B, C, wrapper_BTA_csc_pd_fill); +} + void BTDA_csc_spd_fill_values(const CSC_matrix *B, const double *d, const stacked_pd *A, stacked_pd *C) { diff --git a/tests/all_tests.c b/tests/all_tests.c index b680d0f..3e7bf03 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -438,14 +438,26 @@ int main(void) mu_run_test(test_BTDA_matrices_pd_pd, tests_run); mu_run_test(test_BTDA_matrices_csr_pd, tests_run); mu_run_test(test_BTDA_matrices_pd_csr, tests_run); + mu_run_test(test_BTA_csc_pd_basic, tests_run); + mu_run_test(test_BTA_csc_pd_partial_and_excluded_cols, tests_run); + mu_run_test(test_BTA_csc_pd_empty, tests_run); mu_run_test(test_BTDA_matrices_spd_pd, tests_run); mu_run_test(test_BTDA_matrices_pd_spd, tests_run); + mu_run_test(test_BTA_pd_csc_basic, tests_run); + mu_run_test(test_BTA_pd_csc_partial_and_excluded_cols, tests_run); + mu_run_test(test_BTA_pd_csc_empty, tests_run); mu_run_test(test_BTDA_matrices_spd_spd, tests_run); mu_run_test(test_BTA_pd_spd_two_blocks_both_kept, tests_run); mu_run_test(test_BTDA_pd_spd_two_blocks_both_kept, tests_run); mu_run_test(test_BTDA_spd_pd_overlapping_cp, tests_run); + mu_run_test(test_BTA_spd_pd_overlapping_cp, tests_run); mu_run_test(test_BTDA_spd_csc_overlapping_cp, tests_run); + mu_run_test(test_BTA_spd_csc_overlapping, tests_run); + mu_run_test(test_BTA_spd_csc_block_no_overlap, tests_run); mu_run_test(test_BTDA_spd_spd_overlapping, tests_run); + mu_run_test(test_BTA_spd_spd_overlapping, tests_run); + mu_run_test(test_BTA_spd_spd_multi_A_per_block, tests_run); + mu_run_test(test_BTA_spd_spd_nonoverlapping_block, tests_run); mu_run_test(test_BTA_spd_matrices_pd_A, tests_run); mu_run_test(test_BTA_spd_matrices_csc_A, tests_run); mu_run_test(test_BTA_spd_matrices_spd_A, tests_run); @@ -453,9 +465,15 @@ int main(void) mu_run_test(test_BTA_pd_matrices_csc_A, tests_run); mu_run_test(test_BTA_pd_matrices_spd_A, tests_run); mu_run_test(test_BTDA_csc_spd_overlapping, tests_run); + mu_run_test(test_BTA_csc_spd_overlapping, tests_run); + mu_run_test(test_BTA_csc_spd_block_no_overlap, tests_run); mu_run_test(test_BTA_sparse_matrices_pd_A, tests_run); mu_run_test(test_BTA_sparse_matrices_csc_A, tests_run); mu_run_test(test_BTA_sparse_matrices_spd_A, tests_run); + mu_run_test(test_BTA_matrices_fill_pd_spd, tests_run); + mu_run_test(test_BTA_matrices_fill_spd_csc, tests_run); + mu_run_test(test_BTA_matrices_fill_csc_pd, tests_run); + mu_run_test(test_BTA_matrices_fill_csc_csc, tests_run); mu_run_test(test_BA_pd_kron_spd_no_cache_staleness, tests_run); mu_run_test(test_stacked_pd_construct_and_free, tests_run); mu_run_test(test_coalesce_no_overlap, tests_run); diff --git a/tests/utils/test_matmul_dispatchers.h b/tests/utils/test_matmul_dispatchers.h index 5620574..514ca02 100644 --- a/tests/utils/test_matmul_dispatchers.h +++ b/tests/utils/test_matmul_dispatchers.h @@ -194,6 +194,367 @@ static matrix *spd_to_sparse_matrix_copy(matrix *spd_m) return new_sparse_matrix(dst); } +/* Primitive BTA_csc_pd kernel (no diagonal): C = B^T @ A, B is CSC, A is PD. + Route 1 calls BTA_csc_pd_alloc + BTA_csc_pd_fill_values directly. The oracle + flattens BOTH operands to sparse_matrix and runs the dispatcher with d = ones, + which routes through the csc/csc sparse_dot path — sharing no code with the + csc_pd kernel. Both compared via to_csr. */ +const char *test_BTA_csc_pd_basic(void) +{ + /* A: 4x5 PD, row_perm = [1,3], col_perm = [0,2] (both permuted, so row_inv + genuinely permutes and the transpose matters). */ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + + /* B: 4x4 CSR -> CSC. Rows: 0:{0,2} 1:{1} 2:{0} 3:{3}. */ + int Bp[5] = {0, 2, 3, 4, 5}; + int Bi[5] = {0, 2, 1, 0, 3}; + double Bx[5] = {10.0, 20.0, 30.0, 40.0, 50.0}; + + CSR_matrix *B_csr = new_CSR_matrix(4, 4, 5); + memcpy(B_csr->p, Bp, sizeof Bp); + memcpy(B_csr->i, Bi, sizeof Bi); + memcpy(B_csr->x, Bx, sizeof Bx); + int *iwork = (int *) malloc(MAX(B_csr->m, B_csr->n) * sizeof(int)); + CSC_matrix *B_csc = csr_to_csc_alloc(B_csr, iwork); + csr_to_csc_fill_values(B_csr, B_csc, iwork); + + /* Route 1: our kernel. */ + matrix *C_ours = BTA_csc_pd_alloc(B_csc, (permuted_dense *) A_m); + BTA_csc_pd_fill_values(B_csc, (permuted_dense *) A_m, (permuted_dense *) C_ours); + + /* Oracle: both operands sparse, d = ones, csc/csc dispatch. */ + matrix *A_sparse = spd_to_sparse_matrix_copy(A_m); + CSR_matrix *B_csr2 = new_CSR_matrix(4, 4, 5); + memcpy(B_csr2->p, Bp, sizeof Bp); + memcpy(B_csr2->i, Bi, sizeof Bi); + memcpy(B_csr2->x, Bx, sizeof Bx); + matrix *B_sparse = new_sparse_matrix(B_csr2); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(A_sparse); + free_matrix(C_ours); + free_CSC_matrix(B_csc); + free_CSR_matrix(B_csr); + free(iwork); + free_matrix(A_m); + return 0; +} + +/* BTA_csc_pd corner case: B columns that (a) hit no A-row -> excluded from C's + row_perm by BTA_csc_pd_alloc; (b) partially overlap A's rows (some B->i map to + row_inv == -1) -> exercises sparse_dot_dense's -1 filtering; (c) fully land in + A's rows. A->row_perm = {1,3}. B columns by source rows: col0:{0,2} (excluded), + col1:{1} (clean), col2:{0,1,3} (partial), col3:{3} (clean). */ +const char *test_BTA_csc_pd_partial_and_excluded_cols(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + + /* B: 4x4. Rows: 0:{0,2} 1:{1,2} 2:{0} 3:{2,3}. */ + int Bp[5] = {0, 2, 4, 5, 7}; + int Bi[7] = {0, 2, 1, 2, 0, 2, 3}; + double Bx[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + + CSR_matrix *B_csr = new_CSR_matrix(4, 4, 7); + memcpy(B_csr->p, Bp, sizeof Bp); + memcpy(B_csr->i, Bi, sizeof Bi); + memcpy(B_csr->x, Bx, sizeof Bx); + int *iwork = (int *) malloc(MAX(B_csr->m, B_csr->n) * sizeof(int)); + CSC_matrix *B_csc = csr_to_csc_alloc(B_csr, iwork); + csr_to_csc_fill_values(B_csr, B_csc, iwork); + + matrix *C_ours = BTA_csc_pd_alloc(B_csc, (permuted_dense *) A_m); + BTA_csc_pd_fill_values(B_csc, (permuted_dense *) A_m, (permuted_dense *) C_ours); + + matrix *A_sparse = spd_to_sparse_matrix_copy(A_m); + CSR_matrix *B_csr2 = new_CSR_matrix(4, 4, 7); + memcpy(B_csr2->p, Bp, sizeof Bp); + memcpy(B_csr2->i, Bi, sizeof Bi); + memcpy(B_csr2->x, Bx, sizeof Bx); + matrix *B_sparse = new_sparse_matrix(B_csr2); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(A_sparse); + free_matrix(C_ours); + free_CSC_matrix(B_csc); + free_CSR_matrix(B_csr); + free(iwork); + free_matrix(A_m); + return 0; +} + +/* BTA_csc_pd corner case: every B column misses A's rows -> C->base.nnz == 0 + early-out. A->row_perm = {1,3}; B has nonzeros only in rows {0,2}. */ +const char *test_BTA_csc_pd_empty(void) +{ + int row_perm_A[2] = {1, 3}; + int col_perm_A[2] = {0, 2}; + double XA[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA); + + /* B: 4x4, nonzeros only in rows 0 and 2. Rows: 0:{0,1} 2:{2,3}. */ + int Bp[5] = {0, 2, 2, 4, 4}; + int Bi[4] = {0, 1, 2, 3}; + double Bx[4] = {1.0, 2.0, 3.0, 4.0}; + + CSR_matrix *B_csr = new_CSR_matrix(4, 4, 4); + memcpy(B_csr->p, Bp, sizeof Bp); + memcpy(B_csr->i, Bi, sizeof Bi); + memcpy(B_csr->x, Bx, sizeof Bx); + int *iwork = (int *) malloc(MAX(B_csr->m, B_csr->n) * sizeof(int)); + CSC_matrix *B_csc = csr_to_csc_alloc(B_csr, iwork); + csr_to_csc_fill_values(B_csr, B_csc, iwork); + + matrix *C_ours = BTA_csc_pd_alloc(B_csc, (permuted_dense *) A_m); + BTA_csc_pd_fill_values(B_csc, (permuted_dense *) A_m, (permuted_dense *) C_ours); + + matrix *A_sparse = spd_to_sparse_matrix_copy(A_m); + CSR_matrix *B_csr2 = new_CSR_matrix(4, 4, 4); + memcpy(B_csr2->p, Bp, sizeof Bp); + memcpy(B_csr2->i, Bi, sizeof Bi); + memcpy(B_csr2->x, Bx, sizeof Bx); + matrix *B_sparse = new_sparse_matrix(B_csr2); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("nnz zero", csr_ours->nnz == 0); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(A_sparse); + free_matrix(C_ours); + free_CSC_matrix(B_csc); + free_CSR_matrix(B_csr); + free(iwork); + free_matrix(A_m); + return 0; +} + +/* Primitive BTA_pd_csc kernel (no diagonal): C = B^T @ A, B is PD, A is CSC. + Mirror of the csc_pd tests with roles swapped. Route 1 calls BTA_pd_csc_alloc + + BTA_pd_csc_fill_values. Oracle flattens BOTH operands to sparse_matrix and + runs the dispatcher with d = ones (csc/csc sparse_dot path), sharing no code + with the pd_csc kernel. Both compared via to_csr. */ +const char *test_BTA_pd_csc_basic(void) +{ + /* B: 4x5 PD, row_perm = [1,3], col_perm = [0,2]. */ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(4, 5, 2, 2, row_perm_B, col_perm_B, XB); + + /* A: 4x4 CSR -> CSC. Rows: 0:{1} 1:{0,2} 2:{3} 3:{1,3}. All A-columns + overlap B's rows {1,3} (some partially). */ + int Ap[5] = {0, 1, 3, 4, 6}; + int Ai[6] = {1, 0, 2, 3, 1, 3}; + double Ax[6] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0}; + + CSR_matrix *A_csr = new_CSR_matrix(4, 4, 6); + memcpy(A_csr->p, Ap, sizeof Ap); + memcpy(A_csr->i, Ai, sizeof Ai); + memcpy(A_csr->x, Ax, sizeof Ax); + int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); + + /* Route 1: our kernel. */ + matrix *C_ours = BTA_pd_csc_alloc((permuted_dense *) B_m, A_csc); + BTA_pd_csc_fill_values((permuted_dense *) B_m, A_csc, (permuted_dense *) C_ours); + + /* Oracle: both operands sparse, d = ones, csc/csc dispatch. Dispatcher arg + order is (rightA, leftB) and computes B^T @ A, so A goes first. */ + matrix *B_sparse = spd_to_sparse_matrix_copy(B_m); + CSR_matrix *A_csr2 = new_CSR_matrix(4, 4, 6); + memcpy(A_csr2->p, Ap, sizeof Ap); + memcpy(A_csr2->i, Ai, sizeof Ai); + memcpy(A_csr2->x, Ax, sizeof Ax); + matrix *A_sparse = new_sparse_matrix(A_csr2); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(A_sparse); + free_matrix(B_sparse); + free_matrix(C_ours); + free_CSC_matrix(A_csc); + free_CSR_matrix(A_csr); + free(iwork); + free_matrix(B_m); + return 0; +} + +/* BTA_pd_csc corner case: A columns that (a) hit no B-row -> excluded from C's + col_perm by BTA_pd_csc_alloc; (b) partially overlap B's rows (some A->i map to + row_inv == -1) -> exercises sparse_dot_dense's -1 filtering; (c) fully land in + B's rows. B->row_perm = {1,3}. A columns by source rows: col0:{0,2} (excluded), + col1:{1} (clean), col2:{0,1,3} (partial), col3:{3} (clean). */ +const char *test_BTA_pd_csc_partial_and_excluded_cols(void) +{ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(4, 5, 2, 2, row_perm_B, col_perm_B, XB); + + /* A: 4x4. Rows: 0:{0,2} 1:{1,2} 2:{0} 3:{2,3}. */ + int Ap[5] = {0, 2, 4, 5, 7}; + int Ai[7] = {0, 2, 1, 2, 0, 2, 3}; + double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; + + CSR_matrix *A_csr = new_CSR_matrix(4, 4, 7); + memcpy(A_csr->p, Ap, sizeof Ap); + memcpy(A_csr->i, Ai, sizeof Ai); + memcpy(A_csr->x, Ax, sizeof Ax); + int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); + + matrix *C_ours = BTA_pd_csc_alloc((permuted_dense *) B_m, A_csc); + BTA_pd_csc_fill_values((permuted_dense *) B_m, A_csc, (permuted_dense *) C_ours); + + matrix *B_sparse = spd_to_sparse_matrix_copy(B_m); + CSR_matrix *A_csr2 = new_CSR_matrix(4, 4, 7); + memcpy(A_csr2->p, Ap, sizeof Ap); + memcpy(A_csr2->i, Ai, sizeof Ai); + memcpy(A_csr2->x, Ax, sizeof Ax); + matrix *A_sparse = new_sparse_matrix(A_csr2); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(A_sparse); + free_matrix(B_sparse); + free_matrix(C_ours); + free_CSC_matrix(A_csc); + free_CSR_matrix(A_csr); + free(iwork); + free_matrix(B_m); + return 0; +} + +/* BTA_pd_csc corner case: every A column misses B's rows -> C->base.nnz == 0 + early-out. B->row_perm = {1,3}; A has nonzeros only in rows {0,2}. */ +const char *test_BTA_pd_csc_empty(void) +{ + int row_perm_B[2] = {1, 3}; + int col_perm_B[2] = {0, 2}; + double XB[4] = {1.0, 2.0, 3.0, 4.0}; + matrix *B_m = new_permuted_dense(4, 5, 2, 2, row_perm_B, col_perm_B, XB); + + /* A: 4x4, nonzeros only in rows 0 and 2. Rows: 0:{0,1} 2:{2,3}. */ + int Ap[5] = {0, 2, 2, 4, 4}; + int Ai[4] = {0, 1, 2, 3}; + double Ax[4] = {1.0, 2.0, 3.0, 4.0}; + + CSR_matrix *A_csr = new_CSR_matrix(4, 4, 4); + memcpy(A_csr->p, Ap, sizeof Ap); + memcpy(A_csr->i, Ai, sizeof Ai); + memcpy(A_csr->x, Ax, sizeof Ax); + int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int)); + CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork); + csr_to_csc_fill_values(A_csr, A_csc, iwork); + + matrix *C_ours = BTA_pd_csc_alloc((permuted_dense *) B_m, A_csc); + BTA_pd_csc_fill_values((permuted_dense *) B_m, A_csc, (permuted_dense *) C_ours); + + matrix *B_sparse = spd_to_sparse_matrix_copy(B_m); + CSR_matrix *A_csr2 = new_CSR_matrix(4, 4, 4); + memcpy(A_csr2->p, Ap, sizeof Ap); + memcpy(A_csr2->i, Ai, sizeof Ai); + memcpy(A_csr2->x, Ax, sizeof Ax); + matrix *A_sparse = new_sparse_matrix(A_csr2); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("nnz zero", csr_ours->nnz == 0); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + + free_matrix(C_ref); + free_matrix(A_sparse); + free_matrix(B_sparse); + free_matrix(C_ours); + free_CSC_matrix(A_csc); + free_CSR_matrix(A_csr); + free(iwork); + free_matrix(B_m); + return 0; +} + /* Wrapper dispatch: (A=spd, B=PD). The spd-fallback path goes through BTA_csc_pd_alloc / BTDA_csc_pd_fill_values; result must match the equivalent (A=sparse, B=PD) dispatch on the same matrix. */ @@ -561,6 +922,63 @@ const char *test_BTDA_spd_pd_overlapping_cp(void) return 0; } +/* Primitive BTA_spd_pd kernel (no diagonal). Same matrices as the BTDA + variant above. Reference is the production dispatcher with B flattened to a + sparse_matrix and d = ones, which equals plain B^T @ A. */ +const char *test_BTA_spd_pd_overlapping_cp(void) +{ + /* B: 4x3 spd, two blocks with overlapping col_perms (share col 2). + blk0: rows {0,1}, cols {0,2}, X = [[1,2],[3,4]] + blk1: rows {2,3}, cols {1,2}, X = [[5,6],[7,8]] */ + int B0_rp[2] = {0, 1}; + int B0_cp[2] = {0, 2}; + double B0X[4] = {1, 2, 3, 4}; + matrix *blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); + int B1_rp[2] = {2, 3}; + int B1_cp[2] = {1, 2}; + double B1X[4] = {5, 6, 7, 8}; + matrix *blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) blk0, (permuted_dense *) blk1}; + matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL); + + /* A: 4x3 PD, full row_perm, full col_perm. */ + int A_rp[4] = {0, 1, 2, 3}; + int A_cp[3] = {0, 1, 2}; + double AX[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + matrix *A = new_permuted_dense(4, 3, 4, 3, A_rp, A_cp, AX); + + /* Route 1: our new BTA_spd_pd_fill_values. */ + matrix *C_ours = BTA_spd_pd_alloc((stacked_pd *) B_spd, (permuted_dense *) A); + BTA_spd_pd_fill_values((stacked_pd *) B_spd, (permuted_dense *) A, + (stacked_pd *) C_ours); + + /* Route 2: dispatcher with B flattened to sparse_matrix and d = ones, so + BTDA == BTA. Note BTA_matrices_alloc(A, B) computes B^T @ A, so A goes + first. */ + matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd); + matrix *C_ref = BTA_matrices_alloc(A, B_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A, d_ones, B_sparse, C_ref); + + /* C_ours is stacked_pd, C_ref is permuted_dense — compare via to_csr. */ + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(C_ours); + free_matrix(A); + free_matrix(B_spd); + return 0; +} + /* Primitive BTDA_spd_csc kernel (ATA-style direct: per-block BTDA_pd_csc + accumulating coalesce). C = B^T @ diag(d) @ A with B a 2-block stacked_pd whose col_perms share column 2, and A a sparse_matrix @@ -636,47 +1054,169 @@ const char *test_BTDA_spd_csc_overlapping_cp(void) return 0; } -/* Primitive BTDA_spd_spd kernel (per-block BTDA_pd_spd + accumulating coalesce). - C = B^T @ diag(d) @ A with both B and A 2-block stacked_pds. B's c_k's - share col 2 (outer coalesce-accumulate path), and A's col_perms also - share at least one column (inner BTA_pd_spd accumulate path). Reference - is the production dispatcher with both operands flattened to sparse_matrix. - Output types differ (stacked_pd vs sparse_matrix); compare via to_csr. */ -const char *test_BTDA_spd_spd_overlapping(void) +/* Primitive BTA_spd_csc kernel (no diagonal): C = B^T @ A, B is a 2-block + stacked_pd whose col_perms share a column (accumulating coalesce fires), A is + CSC. Same matrices as the BTDA variant above. Oracle is the dispatcher with B + flattened to sparse and d = ones (csc/csc sparse_dot path). */ +const char *test_BTA_spd_csc_overlapping(void) { - /* B: 4x3 spd, two blocks with overlapping c_k(B) (share col 2). */ + /* B: 4x3 spd, two blocks with overlapping col_perms (share col 2). */ int B0_rp[2] = {0, 1}; int B0_cp[2] = {0, 2}; double B0X[4] = {1, 2, 3, 4}; - matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); + matrix *blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); int B1_rp[2] = {2, 3}; int B1_cp[2] = {1, 2}; double B1X[4] = {5, 6, 7, 8}; - matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); - permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0, - (permuted_dense *) B_blk1}; + matrix *blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) blk0, (permuted_dense *) blk1}; matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL); - /* A: 4x3 spd, two blocks. A_0's rows overlap B_0's, A_1's rows overlap - B_1's. A_0 and A_1 share col 1 → inner BTA_pd_spd accumulate fires. */ - int A0_rp[2] = {0, 1}; - int A0_cp[2] = {0, 1}; - double A0X[4] = {10, 11, 12, 13}; - matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); - int A1_rp[2] = {2, 3}; - int A1_cp[2] = {1, 2}; - double A1X[4] = {20, 21, 22, 23}; - matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); - permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, - (permuted_dense *) A_blk1}; - matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); - - double d[4] = {2.0, -1.5, 0.5, 1.25}; + /* A: 4x3 sparse_matrix wrapping a CSR. Rows: 0:{0,2} 1:{1} 2:{0,1,2} 3:{2}. */ + CSR_matrix *A_csr = new_CSR_matrix(4, 3, 7); + int Ap[5] = {0, 2, 3, 6, 7}; + int Ai[7] = {0, 2, 1, 0, 1, 2, 2}; + double Ax[7] = {1, 2, 3, 4, 5, 6, 7}; + memcpy(A_csr->p, Ap, sizeof(Ap)); + memcpy(A_csr->i, Ai, sizeof(Ai)); + memcpy(A_csr->x, Ax, sizeof(Ax)); + matrix *A_sm = new_sparse_matrix(A_csr); - /* Route 1: new kernel. */ - matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd); - BTDA_spd_spd_fill_values((stacked_pd *) B_spd, d, (stacked_pd *) A_spd, - (stacked_pd *) C_ours); + /* Route 1: new kernel. Need A's csc_cache. */ + sparse_matrix_ensure_csc_cache((sparse_matrix *) A_sm); + A_sm->refresh_csc_values(A_sm); + matrix *C_ours = + BTA_spd_csc_alloc((stacked_pd *) B_spd, ((sparse_matrix *) A_sm)->csc_cache); + BTA_spd_csc_fill_values((stacked_pd *) B_spd, + ((sparse_matrix *) A_sm)->csc_cache, + (stacked_pd *) C_ours); + + /* Route 2: dispatcher with B flattened to sparse and d = ones, so BTDA == BTA. + BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */ + matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd); + matrix *C_ref = BTA_matrices_alloc(A_sm, B_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sm, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(C_ours); + free_matrix(A_sm); + free_matrix(B_spd); + return 0; +} + +/* BTA_spd_csc corner case: one B-block whose rows match no nonzero row of A -> + that block's per-block partial is empty (m0 = 0), while the other block + contributes. Exercises the empty-source-block path through BTA_pd_csc_alloc + and the accumulating coalesce. */ +const char *test_BTA_spd_csc_block_no_overlap(void) +{ + /* B: 4x3 spd, two blocks. blk0 rows {0,1}; blk1 rows {2,3}. */ + int B0_rp[2] = {0, 1}; + int B0_cp[2] = {0, 1}; + double B0X[4] = {1, 2, 3, 4}; + matrix *blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); + int B1_rp[2] = {2, 3}; + int B1_cp[2] = {1, 2}; + double B1X[4] = {5, 6, 7, 8}; + matrix *blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) blk0, (permuted_dense *) blk1}; + matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL); + + /* A: 4x3, nonzeros only in rows {0,1} -> overlaps blk0 only; blk1's rows + {2,3} hit no A nonzero -> empty partial. Rows: 0:{0,2} 1:{1}. */ + CSR_matrix *A_csr = new_CSR_matrix(4, 3, 3); + int Ap[5] = {0, 2, 3, 3, 3}; + int Ai[3] = {0, 2, 1}; + double Ax[3] = {1, 2, 3}; + memcpy(A_csr->p, Ap, sizeof(Ap)); + memcpy(A_csr->i, Ai, sizeof(Ai)); + memcpy(A_csr->x, Ax, sizeof(Ax)); + matrix *A_sm = new_sparse_matrix(A_csr); + + sparse_matrix_ensure_csc_cache((sparse_matrix *) A_sm); + A_sm->refresh_csc_values(A_sm); + matrix *C_ours = + BTA_spd_csc_alloc((stacked_pd *) B_spd, ((sparse_matrix *) A_sm)->csc_cache); + BTA_spd_csc_fill_values((stacked_pd *) B_spd, + ((sparse_matrix *) A_sm)->csc_cache, + (stacked_pd *) C_ours); + + matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd); + matrix *C_ref = BTA_matrices_alloc(A_sm, B_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sm, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(C_ours); + free_matrix(A_sm); + free_matrix(B_spd); + return 0; +} + +/* Primitive BTDA_spd_spd kernel (per-block BTDA_pd_spd + accumulating coalesce). + C = B^T @ diag(d) @ A with both B and A 2-block stacked_pds. B's c_k's + share col 2 (outer coalesce-accumulate path), and A's col_perms also + share at least one column (inner BTA_pd_spd accumulate path). Reference + is the production dispatcher with both operands flattened to sparse_matrix. + Output types differ (stacked_pd vs sparse_matrix); compare via to_csr. */ +const char *test_BTDA_spd_spd_overlapping(void) +{ + /* B: 4x3 spd, two blocks with overlapping c_k(B) (share col 2). */ + int B0_rp[2] = {0, 1}; + int B0_cp[2] = {0, 2}; + double B0X[4] = {1, 2, 3, 4}; + matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); + int B1_rp[2] = {2, 3}; + int B1_cp[2] = {1, 2}; + double B1X[4] = {5, 6, 7, 8}; + matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0, + (permuted_dense *) B_blk1}; + matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL); + + /* A: 4x3 spd, two blocks. A_0's rows overlap B_0's, A_1's rows overlap + B_1's. A_0 and A_1 share col 1 → inner BTA_pd_spd accumulate fires. */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 1}; + double A0X[4] = {10, 11, 12, 13}; + matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); + int A1_rp[2] = {2, 3}; + int A1_cp[2] = {1, 2}; + double A1X[4] = {20, 21, 22, 23}; + matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); + permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, + (permuted_dense *) A_blk1}; + matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); + + double d[4] = {2.0, -1.5, 0.5, 1.25}; + + /* Route 1: new kernel. */ + matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd); + BTDA_spd_spd_fill_values((stacked_pd *) B_spd, d, (stacked_pd *) A_spd, + (stacked_pd *) C_ours); /* Route 2: dispatcher with both flattened to sparse_matrix. BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */ @@ -706,6 +1246,188 @@ const char *test_BTDA_spd_spd_overlapping(void) return 0; } +/* Primitive BTA_spd_spd kernel (no diagonal). Same matrices as the BTDA + variant above. Reference is the production dispatcher with both operands + flattened to sparse_matrix and d = ones, which equals plain B^T @ A. */ +const char *test_BTA_spd_spd_overlapping(void) +{ + /* B: 4x3 spd, two blocks with overlapping c_k(B) (share col 2). */ + int B0_rp[2] = {0, 1}; + int B0_cp[2] = {0, 2}; + double B0X[4] = {1, 2, 3, 4}; + matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); + int B1_rp[2] = {2, 3}; + int B1_cp[2] = {1, 2}; + double B1X[4] = {5, 6, 7, 8}; + matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0, + (permuted_dense *) B_blk1}; + matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL); + + /* A: 4x3 spd, two blocks. A_0's rows overlap B_0's, A_1's rows overlap + B_1's. A_0 and A_1 share col 1. */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 1}; + double A0X[4] = {10, 11, 12, 13}; + matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); + int A1_rp[2] = {2, 3}; + int A1_cp[2] = {1, 2}; + double A1X[4] = {20, 21, 22, 23}; + matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); + permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, + (permuted_dense *) A_blk1}; + matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); + + /* Route 1: new kernel. */ + matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd); + BTA_spd_spd_fill_values((stacked_pd *) B_spd, (stacked_pd *) A_spd, + (stacked_pd *) C_ours); + + /* Route 2: dispatcher with both flattened to sparse_matrix and d = ones, so + BTDA == BTA. BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */ + matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd); + matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + /* C_ours is stacked_pd, C_ref is sparse_matrix — compare via to_csr. */ + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(A_sparse); + free_matrix(C_ours); + free_matrix(A_spd); + free_matrix(B_spd); + return 0; +} + +/* BTA_spd_spd corner case: a single B-block whose rows span MULTIPLE A-blocks, + forcing BTA_pd_spd_fill_values' inner += accumulation over A-blocks into one + partial, with overlapping output columns (A_0 and A_1 share col 1). Not + exercised by test_BTA_spd_spd_overlapping (there each B-block overlaps exactly + one A-block). */ +const char *test_BTA_spd_spd_multi_A_per_block(void) +{ + /* B: 4x2, one block spanning all four contraction rows. */ + int B0_rp[4] = {0, 1, 2, 3}; + int B0_cp[2] = {0, 1}; + double B0X[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + matrix *B_blk0 = new_permuted_dense(4, 2, 4, 2, B0_rp, B0_cp, B0X); + permuted_dense *B_blocks[1] = {(permuted_dense *) B_blk0}; + matrix *B_spd = new_stacked_pd(4, 2, 1, B_blocks, NULL, NULL); + + /* A: 4x3, two blocks with disjoint rows but overlapping cols (share col 1). + Both overlap B_0's rows, so both contribute to B_0's single partial. */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 1}; + double A0X[4] = {10, 11, 12, 13}; + matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); + int A1_rp[2] = {2, 3}; + int A1_cp[2] = {1, 2}; + double A1X[4] = {20, 21, 22, 23}; + matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); + permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, + (permuted_dense *) A_blk1}; + matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); + + matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd); + BTA_spd_spd_fill_values((stacked_pd *) B_spd, (stacked_pd *) A_spd, + (stacked_pd *) C_ours); + + matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd); + matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(A_sparse); + free_matrix(C_ours); + free_matrix(A_spd); + free_matrix(B_spd); + return 0; +} + +/* BTA_spd_spd corner case: a B-block that overlaps NO A-block, producing an + empty partial (n0 = 0) that coalesces to a zero-column output block (output + row 1, all zeros). Exercises the partial-level nnz == 0 early-out in + BTA_pd_spd_fill_values and the empty-block path through coalesce/to_csr. */ +const char *test_BTA_spd_spd_nonoverlapping_block(void) +{ + /* B: 4x2, two blocks. B_0 rows {0,1}; B_1 rows {2,3}. */ + int B0_rp[2] = {0, 1}; + int B0_cp[1] = {0}; + double B0X[2] = {1, 2}; + matrix *B_blk0 = new_permuted_dense(4, 2, 2, 1, B0_rp, B0_cp, B0X); + int B1_rp[2] = {2, 3}; + int B1_cp[1] = {1}; + double B1X[2] = {3, 4}; + matrix *B_blk1 = new_permuted_dense(4, 2, 2, 1, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0, + (permuted_dense *) B_blk1}; + matrix *B_spd = new_stacked_pd(4, 2, 2, B_blocks, NULL, NULL); + + /* A: 4x2, one block on rows {0,1}. B_0 overlaps it; B_1 does not. */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 1}; + double A0X[4] = {10, 11, 12, 13}; + matrix *A_blk0 = new_permuted_dense(4, 2, 2, 2, A0_rp, A0_cp, A0X); + permuted_dense *A_blocks[1] = {(permuted_dense *) A_blk0}; + matrix *A_spd = new_stacked_pd(4, 2, 1, A_blocks, NULL, NULL); + + matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd); + BTA_spd_spd_fill_values((stacked_pd *) B_spd, (stacked_pd *) A_spd, + (stacked_pd *) C_ours); + + matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd); + matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse); + A_sparse->refresh_csc_values(A_sparse); + B_sparse->refresh_csc_values(B_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(B_sparse); + free_matrix(A_sparse); + free_matrix(C_ours); + free_matrix(A_spd); + free_matrix(B_spd); + return 0; +} + /* BTA_spd_matrices dispatcher: A is permuted_dense. Verifies the PD-branch routes to BTA_spd_pd / BTDA_spd_pd. */ const char *test_BTA_spd_matrices_pd_A(void) @@ -1121,6 +1843,128 @@ const char *test_BTDA_csc_spd_overlapping(void) return 0; } +/* Primitive BTA_csc_spd kernel (no diagonal): C = B^T @ A, B is CSC, A is a + 2-block stacked_pd whose col_perms share a column (so the accumulating coalesce + path fires). Same matrices as the BTDA variant above. Oracle is the dispatcher + with A flattened to sparse and d = ones (csc/csc sparse_dot path). */ +const char *test_BTA_csc_spd_overlapping(void) +{ + /* A: 4x3 spd, two blocks with overlapping col_perms (share col 2). */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 2}; + double A0X[4] = {1, 2, 3, 4}; + matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); + int A1_rp[2] = {2, 3}; + int A1_cp[2] = {1, 2}; + double A1X[4] = {5, 6, 7, 8}; + matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); + permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, + (permuted_dense *) A_blk1}; + matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); + + /* B: 4x3 sparse_matrix wrapping a CSR. */ + CSR_matrix *B_csr = new_CSR_matrix(4, 3, 7); + int Bp[5] = {0, 2, 3, 6, 7}; + int Bi[7] = {0, 2, 1, 0, 1, 2, 2}; + double Bx[7] = {1, 2, 3, 4, 5, 6, 7}; + memcpy(B_csr->p, Bp, sizeof(Bp)); + memcpy(B_csr->i, Bi, sizeof(Bi)); + memcpy(B_csr->x, Bx, sizeof(Bx)); + matrix *B_sm = new_sparse_matrix(B_csr); + + /* Route 1: new kernel. Ensure B's csc_cache and refresh values. */ + sparse_matrix_ensure_csc_cache((sparse_matrix *) B_sm); + B_sm->refresh_csc_values(B_sm); + matrix *C_ours = + BTA_csc_spd_alloc(((sparse_matrix *) B_sm)->csc_cache, (stacked_pd *) A_spd); + BTA_csc_spd_fill_values(((sparse_matrix *) B_sm)->csc_cache, + (stacked_pd *) A_spd, (stacked_pd *) C_ours); + + /* Route 2: dispatcher with A flattened to sparse_matrix and d = ones, so + BTDA == BTA. BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */ + matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sm); + A_sparse->refresh_csc_values(A_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sm, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(A_sparse); + free_matrix(C_ours); + free_matrix(B_sm); + free_matrix(A_spd); + return 0; +} + +/* BTA_csc_spd corner case: one A-block whose row_perm matches no nonzero row of + any B column -> that block's per-block partial is empty (m0 = 0), while the + other block contributes. Exercises the empty-source-block path through + BTA_csc_pd_alloc and the accumulating coalesce. */ +const char *test_BTA_csc_spd_block_no_overlap(void) +{ + /* A: 4x3 spd, two blocks. blk0 rows {0,1}; blk1 rows {2,3}. */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 1}; + double A0X[4] = {1, 2, 3, 4}; + matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); + int A1_rp[2] = {2, 3}; + int A1_cp[2] = {1, 2}; + double A1X[4] = {5, 6, 7, 8}; + matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); + permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, + (permuted_dense *) A_blk1}; + matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); + + /* B: 4x3, nonzeros only in rows {0,1} -> overlaps blk0 only; blk1's rows + {2,3} hit no B column -> empty partial for blk1. Rows: 0:{0,2} 1:{1}. */ + CSR_matrix *B_csr = new_CSR_matrix(4, 3, 3); + int Bp[5] = {0, 2, 3, 3, 3}; + int Bi[3] = {0, 2, 1}; + double Bx[3] = {1, 2, 3}; + memcpy(B_csr->p, Bp, sizeof(Bp)); + memcpy(B_csr->i, Bi, sizeof(Bi)); + memcpy(B_csr->x, Bx, sizeof(Bx)); + matrix *B_sm = new_sparse_matrix(B_csr); + + sparse_matrix_ensure_csc_cache((sparse_matrix *) B_sm); + B_sm->refresh_csc_values(B_sm); + matrix *C_ours = + BTA_csc_spd_alloc(((sparse_matrix *) B_sm)->csc_cache, (stacked_pd *) A_spd); + BTA_csc_spd_fill_values(((sparse_matrix *) B_sm)->csc_cache, + (stacked_pd *) A_spd, (stacked_pd *) C_ours); + + matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd); + matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sm); + A_sparse->refresh_csc_values(A_sparse); + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + BTDA_matrices_fill_values(A_sparse, d_ones, B_sm, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(A_sparse); + free_matrix(C_ours); + free_matrix(B_sm); + free_matrix(A_spd); + return 0; +} + /* BTA_sparse_matrices dispatcher: A is permuted_dense. Both routes produce PD; compare X / perms directly. */ const char *test_BTA_sparse_matrices_pd_A(void) @@ -1322,6 +2166,203 @@ const char *test_BTA_sparse_matrices_spd_A(void) return 0; } +/* Top-level BTA_matrices_fill_values dispatcher. Each test runs Route 1 = + BTA_matrices_alloc + BTA_matrices_fill_values and Route 2 = the same alloc + + BTDA_matrices_fill_values with d = ones (B^T diag(1) A = B^T A). The two + share structure, so we compare values via to_csr. Together the four cover all + three B-branches and the special csc/csc (d == NULL) path. Recall + BTA_matrices_alloc(A, B) computes B^T @ A, so the transposed operand goes + second. */ +const char *test_BTA_matrices_fill_pd_spd(void) +{ + /* B: 4x5 PD (transposed operand). */ + int B_rp[3] = {0, 1, 2}; + int B_cp[3] = {1, 3, 4}; + double BX[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + matrix *B = new_permuted_dense(4, 5, 3, 3, B_rp, B_cp, BX); + + /* A: 4x3 spd, two blocks (share col 2). */ + int A0_rp[2] = {0, 1}; + int A0_cp[2] = {0, 2}; + double A0X[4] = {1, 2, 3, 4}; + matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X); + int A1_rp[2] = {2, 3}; + int A1_cp[2] = {1, 2}; + double A1X[4] = {5, 6, 7, 8}; + matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X); + permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0, + (permuted_dense *) A_blk1}; + matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL); + + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + + matrix *C_ours = BTA_matrices_alloc(A_spd, B); + BTA_matrices_fill_values(A_spd, B, C_ours); + + matrix *C_ref = BTA_matrices_alloc(A_spd, B); + BTDA_matrices_fill_values(A_spd, d_ones, B, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(C_ours); + free_matrix(A_spd); + free_matrix(B); + return 0; +} + +const char *test_BTA_matrices_fill_spd_csc(void) +{ + /* B: 4x3 spd, two blocks (transposed operand). */ + int B0_rp[2] = {0, 1}; + int B0_cp[2] = {0, 2}; + double B0X[4] = {1, 2, 3, 4}; + matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X); + int B1_rp[2] = {2, 3}; + int B1_cp[2] = {1, 2}; + double B1X[4] = {5, 6, 7, 8}; + matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X); + permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0, + (permuted_dense *) B_blk1}; + matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL); + + /* A: 4x3 sparse_matrix. */ + CSR_matrix *A_csr = new_CSR_matrix(4, 3, 7); + int Ap[5] = {0, 2, 3, 6, 7}; + int Ai[7] = {0, 2, 1, 0, 1, 2, 2}; + double Ax[7] = {1, 2, 3, 4, 5, 6, 7}; + memcpy(A_csr->p, Ap, sizeof(Ap)); + memcpy(A_csr->i, Ai, sizeof(Ai)); + memcpy(A_csr->x, Ax, sizeof(Ax)); + matrix *A_sm = new_sparse_matrix(A_csr); + + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + + matrix *C_ours = BTA_matrices_alloc(A_sm, B_spd); + A_sm->refresh_csc_values(A_sm); + BTA_matrices_fill_values(A_sm, B_spd, C_ours); + + matrix *C_ref = BTA_matrices_alloc(A_sm, B_spd); + A_sm->refresh_csc_values(A_sm); + BTDA_matrices_fill_values(A_sm, d_ones, B_spd, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(C_ours); + free_matrix(A_sm); + free_matrix(B_spd); + return 0; +} + +const char *test_BTA_matrices_fill_csc_pd(void) +{ + /* B: 4x4 sparse_matrix (transposed operand). */ + CSR_matrix *B_csr = new_CSR_matrix(4, 4, 5); + int Bp[5] = {0, 2, 3, 4, 5}; + int Bi[5] = {0, 2, 1, 0, 3}; + double Bx[5] = {10, 20, 30, 40, 50}; + memcpy(B_csr->p, Bp, sizeof(Bp)); + memcpy(B_csr->i, Bi, sizeof(Bi)); + memcpy(B_csr->x, Bx, sizeof(Bx)); + matrix *B_sm = new_sparse_matrix(B_csr); + + /* A: 4x3 PD. */ + int A_rp[4] = {0, 1, 2, 3}; + int A_cp[3] = {0, 1, 2}; + double AX[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + matrix *A = new_permuted_dense(4, 3, 4, 3, A_rp, A_cp, AX); + + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + + matrix *C_ours = BTA_matrices_alloc(A, B_sm); + B_sm->refresh_csc_values(B_sm); + BTA_matrices_fill_values(A, B_sm, C_ours); + + matrix *C_ref = BTA_matrices_alloc(A, B_sm); + B_sm->refresh_csc_values(B_sm); + BTDA_matrices_fill_values(A, d_ones, B_sm, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(C_ours); + free_matrix(A); + free_matrix(B_sm); + return 0; +} + +const char *test_BTA_matrices_fill_csc_csc(void) +{ + /* B: 4x3 sparse_matrix (transposed operand). */ + CSR_matrix *B_csr = new_CSR_matrix(4, 3, 7); + int Bp[5] = {0, 2, 3, 6, 7}; + int Bi[7] = {0, 2, 1, 0, 1, 2, 2}; + double Bx[7] = {1, 2, 3, 4, 5, 6, 7}; + memcpy(B_csr->p, Bp, sizeof(Bp)); + memcpy(B_csr->i, Bi, sizeof(Bi)); + memcpy(B_csr->x, Bx, sizeof(Bx)); + matrix *B_sm = new_sparse_matrix(B_csr); + + /* A: 4x4 sparse_matrix. */ + CSR_matrix *A_csr = new_CSR_matrix(4, 4, 5); + int Ap[5] = {0, 2, 3, 4, 5}; + int Ai[5] = {0, 2, 1, 0, 3}; + double Ax[5] = {10, 20, 30, 40, 50}; + memcpy(A_csr->p, Ap, sizeof(Ap)); + memcpy(A_csr->i, Ai, sizeof(Ai)); + memcpy(A_csr->x, Ax, sizeof(Ax)); + matrix *A_sm = new_sparse_matrix(A_csr); + + double d_ones[4] = {1.0, 1.0, 1.0, 1.0}; + + matrix *C_ours = BTA_matrices_alloc(A_sm, B_sm); + A_sm->refresh_csc_values(A_sm); + B_sm->refresh_csc_values(B_sm); + BTA_matrices_fill_values(A_sm, B_sm, C_ours); + + matrix *C_ref = BTA_matrices_alloc(A_sm, B_sm); + A_sm->refresh_csc_values(A_sm); + B_sm->refresh_csc_values(B_sm); + BTDA_matrices_fill_values(A_sm, d_ones, B_sm, C_ref); + + CSR_matrix *csr_ours = C_ours->to_csr(C_ours); + CSR_matrix *csr_ref = C_ref->to_csr(C_ref); + mu_assert("m", csr_ours->m == csr_ref->m); + mu_assert("n", csr_ours->n == csr_ref->n); + mu_assert("nnz", csr_ours->nnz == csr_ref->nnz); + mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1)); + mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz)); + mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz)); + + free_matrix(C_ref); + free_matrix(C_ours); + free_matrix(A_sm); + free_matrix(B_sm); + return 0; +} + /* Regression test for the kron-scratch / transpose-cache crash. Exercises BA_dense_kron_spd_fill_values with p > 1: the underlying kron_for_each_ output_block reuses a single scratch PD across iterations, mutating