diff --git a/include/utils/matmul_dispatchers.h b/include/utils/matmul_dispatchers.h
index 4f4ea28..8fd326b 100644
--- a/include/utils/matmul_dispatchers.h
+++ b/include/utils/matmul_dispatchers.h
@@ -29,8 +29,9 @@
 
    Contract: for sparse_matrix operands (A or B), the caller is responsible for
    refreshing csc_cache values via refresh_csc_values before
-   BTDA_matrices_fill_values. */
+   BTA_matrices_fill_values / BTDA_matrices_fill_values. */
 matrix *BTA_matrices_alloc(matrix *A, matrix *B);
+void BTA_matrices_fill_values(matrix *A, matrix *B, matrix *C);
 void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C);
 
 /* Polymorphic dispatchers for C = B @ A, where B is permuted_dense and A is any
diff --git a/include/utils/permuted_dense_linalg.h b/include/utils/permuted_dense_linalg.h
index b049a73..0d8de83 100644
--- a/include/utils/permuted_dense_linalg.h
+++ b/include/utils/permuted_dense_linalg.h
@@ -64,6 +64,10 @@ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A,
 /* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */
 matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A);
 
+/* Fill values of C = B^T @ A where B is PD and A is CSC. */
+void BTA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A,
+                            permuted_dense *C);
+
 /* Fill values of C = B^T @ diag(d) @ A where B is PD and A is CSC */
 void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d,
                              const CSC_matrix *A, permuted_dense *C);
@@ -71,6 +75,10 @@ void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d,
 /* Allocate new permuted_dense for C = B^T @ A where B is Sparse CSC and A is PD. */
 matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A);
 
+/* Fill values of C = B^T @ A where B is CSC and A is PD. */
+void BTA_csc_pd_fill_values(const CSC_matrix *B, const permuted_dense *A,
+                            permuted_dense *C);
+
 /* Fill values of C = B^T @ diag(d) @ A where B is CSC and A is PD */
 void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
                              const permuted_dense *A, permuted_dense *C);
diff --git a/include/utils/stacked_pd_linalg.h b/include/utils/stacked_pd_linalg.h
index 8083fdf..2d50b27 100644
--- a/include/utils/stacked_pd_linalg.h
+++ b/include/utils/stacked_pd_linalg.h
@@ -61,6 +61,11 @@ void BTDA_pd_spd_fill_values(const permuted_dense *B, const double *d,
    permuted_dense. The output C is stacked_pd. */
 matrix *BTA_spd_pd_alloc(const stacked_pd *B, const permuted_dense *A);
 
+/* Fill values of C = B^T @ A where B is stacked_pd, A is permuted_dense, and
+   C is stacked_pd. */
+void BTA_spd_pd_fill_values(const stacked_pd *B, const permuted_dense *A,
+                            stacked_pd *C);
+
 /* Fill values of C = B^T @ diag(d) @ A where B is stacked_pd, A is
    permuted_dense, d is a global vector of length B->m, and C is stacked_pd. */
 void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d,
@@ -70,6 +75,11 @@ void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d,
    The output C is stacked_pd. */
 matrix *BTA_spd_csc_alloc(const stacked_pd *B, const CSC_matrix *A);
 
+/* Fill values of C = B^T @ A where B is stacked_pd and A is CSC, and C is
+   stacked_pd. C must be pre-allocated via BTA_spd_csc_alloc. */
+void BTA_spd_csc_fill_values(const stacked_pd *B, const CSC_matrix *A,
+                             stacked_pd *C);
+
 /* Fill values of C = B^T @ diag(d) @ A where B is stacked_pd, A is CSC,
    d is a global vector of length B->m, and C is stacked_pd. */
 void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d,
@@ -79,6 +89,11 @@ void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d,
    The output C is stacked_pd. */
 matrix *BTA_spd_spd_alloc(const stacked_pd *B, const stacked_pd *A);
 
+/* Fill values of C = B^T @ A where both B and A are stacked_pd, and C is
+   stacked_pd. C must be pre-allocated via BTA_spd_spd_alloc. */
+void BTA_spd_spd_fill_values(const stacked_pd *B, const stacked_pd *A,
+                             stacked_pd *C);
+
 /* Fill values of C = B^T @ diag(d) @ A where both B and A are stacked_pd,
    d is a global vector of length B->m, and C is stacked_pd. C must be
    pre-allocated via BTA_spd_spd_alloc. */
@@ -89,6 +104,11 @@ void BTDA_spd_spd_fill_values(const stacked_pd *B, const double *d,
    The output C is stacked_pd. */
 matrix *BTA_csc_spd_alloc(const CSC_matrix *B, const stacked_pd *A);
 
+/* Fill values of C = B^T @ A where B is CSC and A is stacked_pd, and C is
+   stacked_pd. C must be pre-allocated via BTA_csc_spd_alloc. */
+void BTA_csc_spd_fill_values(const CSC_matrix *B, const stacked_pd *A,
+                             stacked_pd *C);
+
 /* Fill values of C = B^T @ diag(d) @ A where B is CSC, A is stacked_pd,
    d is a global vector of length A->m, and C is stacked_pd. C must be
    pre-allocated via BTA_csc_spd_alloc. */
diff --git a/src/utils/matmul_dispatchers.c b/src/utils/matmul_dispatchers.c
index 72967d6..6ba31bd 100644
--- a/src/utils/matmul_dispatchers.c
+++ b/src/utils/matmul_dispatchers.c
@@ -22,14 +22,20 @@
 #include <assert.h>
 
 /* Forward declarations of the fixed-B dispatchers used internally by
-   BTA_matrices_alloc / BTDA_matrices_fill_values. */
+   BTA_matrices_alloc / BTA_matrices_fill_values / BTDA_matrices_fill_values. */
 static matrix *BTA_pd_matrices_alloc(const permuted_dense *B, matrix *A);
+static void BTA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A,
+                                        permuted_dense *C);
 static void BTDA_pd_matrices_fill_values(const permuted_dense *B, const double *d,
                                          const matrix *A, permuted_dense *C);
 static matrix *BTA_spd_matrices_alloc(const stacked_pd *B, matrix *A);
+static void BTA_spd_matrices_fill_values(const stacked_pd *B, const matrix *A,
+                                         stacked_pd *C);
 static void BTDA_spd_matrices_fill_values(const stacked_pd *B, const double *d,
                                           const matrix *A, stacked_pd *C);
 static matrix *BTA_sparse_matrices_alloc(const sparse_matrix *B, matrix *A);
+static void BTA_sparse_matrices_fill_values(const sparse_matrix *B, const matrix *A,
+                                            matrix *C);
 static void BTDA_sparse_matrices_fill_values(const sparse_matrix *B, const double *d,
                                              const matrix *A, matrix *C);
 
@@ -65,6 +71,23 @@ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C)
     BTDA_sparse_matrices_fill_values((const sparse_matrix *) B, d, A, C);
 }
 
+void BTA_matrices_fill_values(matrix *A, matrix *B, matrix *C)
+{
+    if (B->is_permuted_dense)
+    {
+        BTA_pd_matrices_fill_values((const permuted_dense *) B, A,
+                                    (permuted_dense *) C);
+        return;
+    }
+    if (B->is_stacked_pd)
+    {
+        BTA_spd_matrices_fill_values((const stacked_pd *) B, A, (stacked_pd *) C);
+        return;
+    }
+    /* B is sparse_matrix */
+    BTA_sparse_matrices_fill_values((const sparse_matrix *) B, A, C);
+}
+
 matrix *BA_pd_matrices_alloc(const permuted_dense *B, matrix *A)
 {
     if (A->is_permuted_dense)
@@ -137,6 +160,25 @@ static void BTDA_pd_matrices_fill_values(const permuted_dense *B, const double *
     BTDA_pd_csc_fill_values(B, d, sm_A->csc_cache, C);
 }
 
+static void BTA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A,
+                                        permuted_dense *C)
+{
+    if (A->is_permuted_dense)
+    {
+        BTA_pd_pd_fill_values(B, (const permuted_dense *) A, C);
+        return;
+    }
+    if (A->is_stacked_pd)
+    {
+        BTA_pd_spd_fill_values(B, (const stacked_pd *) A, C);
+        return;
+    }
+
+    /* A is sparse */
+    const sparse_matrix *sm_A = (const sparse_matrix *) A;
+    BTA_pd_csc_fill_values(B, sm_A->csc_cache, C);
+}
+
 static matrix *BTA_spd_matrices_alloc(const stacked_pd *B, matrix *A)
 {
     if (A->is_permuted_dense)
@@ -173,6 +215,25 @@ static void BTDA_spd_matrices_fill_values(const stacked_pd *B, const double *d,
     BTDA_spd_csc_fill_values(B, d, sm_A->csc_cache, C);
 }
 
+static void BTA_spd_matrices_fill_values(const stacked_pd *B, const matrix *A,
+                                         stacked_pd *C)
+{
+    if (A->is_permuted_dense)
+    {
+        BTA_spd_pd_fill_values(B, (const permuted_dense *) A, C);
+        return;
+    }
+    if (A->is_stacked_pd)
+    {
+        BTA_spd_spd_fill_values(B, (const stacked_pd *) A, C);
+        return;
+    }
+
+    /* A is sparse */
+    const sparse_matrix *sm_A = (const sparse_matrix *) A;
+    BTA_spd_csc_fill_values(B, sm_A->csc_cache, C);
+}
+
 static matrix *BTA_sparse_matrices_alloc(const sparse_matrix *B, matrix *A)
 {
     /* Ensure B's csc_cache structure exists. */
@@ -216,6 +277,28 @@ static void BTDA_sparse_matrices_fill_values(const sparse_matrix *B, const doubl
     BTDA_fill_values(sm_A->csc_cache, B->csc_cache, d, ((sparse_matrix *) C)->csr);
 }
 
+static void BTA_sparse_matrices_fill_values(const sparse_matrix *B, const matrix *A,
+                                            matrix *C)
+{
+    if (A->is_permuted_dense)
+    {
+        BTA_csc_pd_fill_values(B->csc_cache, (const permuted_dense *) A,
+                               (permuted_dense *) C);
+        return;
+    }
+    if (A->is_stacked_pd)
+    {
+        BTA_csc_spd_fill_values(B->csc_cache, (const stacked_pd *) A,
+                                (stacked_pd *) C);
+        return;
+    }
+
+    /* A is sparse */
+    const sparse_matrix *sm_A = (const sparse_matrix *) A;
+    BTDA_fill_values(sm_A->csc_cache, B->csc_cache, NULL,
+                     ((sparse_matrix *) C)->csr);
+}
+
 /* Debug-only check that A is the "full" permuted_dense shape the kron
    helpers require: m0 == base.m, n0 == base.n, identity inner perms.
    Called once at alloc; not repeated at fill (caller would have to
diff --git a/src/utils/permuted_dense_linalg.c b/src/utils/permuted_dense_linalg.c
index dc493e7..f6a9cd2 100644
--- a/src/utils/permuted_dense_linalg.c
+++ b/src/utils/permuted_dense_linalg.c
@@ -415,6 +415,21 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A)
     return C;
 }
 
+/* C = B^T @ A = (B^T) A, with B^T materialized into B->kernel_dwork. The no-
+   diagonal analogue of BTDA_pd_csc_fill_values. */
+void BTA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A,
+                            permuted_dense *C)
+{
+    /* C may be empty */
+    if (C->base.nnz == 0)
+    {
+        return;
+    }
+
+    A_transpose(B->kernel_dwork, B->X, B->m0, B->n0);
+    BA_pd_csc_fill_values(B->kernel_dwork, B->m0, B->row_inv, A, C);
+}
+
 /* C = B^T diag(d) A = (diag (d) B)^T A */
 void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d,
                              const CSC_matrix *A, permuted_dense *C)
@@ -469,12 +484,12 @@ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A)
     return C;
 }
 
-/* Internal helper for BTDA_csc_pd_fill_values: C = B^T @ A where B is CSC
-   and the right operand A is supplied as a transposed-layout raw buffer
-   (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense
+/* Internal core for BTA_csc_pd_fill_values / BTDA_csc_pd_fill_values: C = B^T @ A
+   where B is CSC and the right operand A is supplied as a transposed-layout raw
+   buffer (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense
    block). Transposed-output sibling of BA_pd_csc_fill_values. */
-static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m0_A,
-                                   const int *inv, permuted_dense *C)
+static void BTA_csc_denseT_fill_values(const CSC_matrix *B, const double *A_T,
+                                       int m0_A, const int *inv, permuted_dense *C)
 {
     /* C[i_C, j_C] = dot(col C->row_perm[i_C] of B, row j_C of A_T). */
     for (int i_C = 0; i_C < C->m0; i_C++)
@@ -491,9 +506,25 @@ static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m
     }
 }
 
+/* C = B^T @ A where B is CSC and A is permuted_dense. The csc_pd analogue of
+   BTDA_csc_pd_fill_values without the diagonal: transpose A's dense block into
+   A->kernel_dwork (the column-contiguous layout the core wants) and delegate. */
+void BTA_csc_pd_fill_values(const CSC_matrix *B, const permuted_dense *A,
+                            permuted_dense *C)
+{
+    if (C->base.nnz == 0)
+    {
+        return;
+    }
+
+    /* A->kernel_dwork = X_A^T, row-major shape (n0_A, m0_A). */
+    A_transpose(A->kernel_dwork, A->X, A->m0, A->n0);
+    BTA_csc_denseT_fill_values(B, A->kernel_dwork, A->m0, A->row_inv, C);
+}
+
 /* C = B^T diag(d) A. Folds diag(d) into A's dense block (writing
    (diag(d_perm) X_A)^T into A->kernel_dwork) and delegates to
-   BTA_csc_pd_fill_values. Mirrors how BTDA_pd_csc_fill_values wraps
+   BTA_csc_denseT_fill_values. Mirrors how BTDA_pd_csc_fill_values wraps
    BA_pd_csc_fill_values. */
 void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
                              const permuted_dense *A, permuted_dense *C)
@@ -509,7 +540,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
     /* A->kernel_dwork = (diag(d_perm) X_A)^T, row-major shape (n0_A, m0_A).
        Pre-sized by BTA_csc_pd_alloc; no allocation in fill.
        Column j of (diag(d) X_A) lives contiguously in dwork as row j —
-       which is exactly the layout BTA_csc_pd_fill_values wants. */
+       which is exactly the layout BTA_csc_denseT_fill_values wants. */
     for (int kk = 0; kk < m0_A; kk++)
     {
         double dk = d[A->row_perm[kk]];
@@ -519,7 +550,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
         }
     }
 
-    BTA_csc_pd_fill_values(B, A->kernel_dwork, m0_A, A->row_inv, C);
+    BTA_csc_denseT_fill_values(B, A->kernel_dwork, m0_A, A->row_inv, C);
 }
 
 /* Original transpose-via-Cprime implementation of BTDA_csc_pd_fill_values.
@@ -528,10 +559,10 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
 #if defined(__GNUC__) || defined(__clang__)
 __attribute__((unused))
 #endif
-static void BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B,
-                                                       const double *d,
-                                                       const permuted_dense *A,
-                                                       permuted_dense *C)
+static void
+BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B, const double *d,
+                                           const permuted_dense *A,
+                                           permuted_dense *C)
 {
     if (C->base.nnz == 0)
     {
diff --git a/src/utils/stacked_pd_linalg.c b/src/utils/stacked_pd_linalg.c
index 90a098f..dc46743 100644
--- a/src/utils/stacked_pd_linalg.c
+++ b/src/utils/stacked_pd_linalg.c
@@ -406,12 +406,29 @@ static void wrapper_BTDA_pd_pd(const permuted_dense *Bk, const double *d,
     BTDA_pd_pd_fill_values(Bk, d, (const permuted_dense *) ctx, Ck);
 }
 
+/* BTA per-block fill: plain B_k^T @ A, no diagonal. The shared skeleton
+   threads d through verbatim and never dereferences it, so the caller passes
+   d = NULL and we ignore it here. Calling BTA_pd_pd_fill_values directly (vs.
+   the BTDA wrapper) also sidesteps the per-block DA temp. */
+static void wrapper_BTA_pd_pd_fill(const permuted_dense *Bk, const double *d,
+                                   const void *ctx, permuted_dense *Ck)
+{
+    (void) d;
+    BTA_pd_pd_fill_values(Bk, (const permuted_dense *) ctx, Ck);
+}
+
 matrix *BTA_spd_pd_alloc(const stacked_pd *B, const permuted_dense *A)
 {
     return spd_blockwise_alloc_coalesce(B, B->base.n, A->base.n, wrapper_BTA_pd_pd,
                                         A);
 }
 
+void BTA_spd_pd_fill_values(const stacked_pd *B, const permuted_dense *A,
+                            stacked_pd *C)
+{
+    spd_blockwise_fill_coalesce_accumulate(B, NULL, A, C, wrapper_BTA_pd_pd_fill);
+}
+
 void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d,
                              const permuted_dense *A, stacked_pd *C)
 {
@@ -436,11 +453,26 @@ static void wrapper_BTDA_pd_csc(const permuted_dense *Bk, const double *d,
     BTDA_pd_csc_fill_values(Bk, d, (const CSC_matrix *) ctx, Ck);
 }
 
+/* BTA per-block fill: plain B_k^T @ A, no diagonal. The skeleton forwards d
+   verbatim and never dereferences it, so the caller passes d = NULL and we
+   ignore it. */
+static void wrapper_BTA_pd_csc_fill(const permuted_dense *Bk, const double *d,
+                                    const void *ctx, permuted_dense *Ck)
+{
+    (void) d;
+    BTA_pd_csc_fill_values(Bk, (const CSC_matrix *) ctx, Ck);
+}
+
 matrix *BTA_spd_csc_alloc(const stacked_pd *B, const CSC_matrix *A)
 {
     return spd_blockwise_alloc_coalesce(B, B->base.n, A->n, wrapper_BTA_pd_csc, A);
 }
 
+void BTA_spd_csc_fill_values(const stacked_pd *B, const CSC_matrix *A, stacked_pd *C)
+{
+    spd_blockwise_fill_coalesce_accumulate(B, NULL, A, C, wrapper_BTA_pd_csc_fill);
+}
+
 void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d,
                               const CSC_matrix *A, stacked_pd *C)
 {
@@ -465,12 +497,28 @@ static void wrapper_BTDA_pd_spd(const permuted_dense *Bk, const double *d,
     BTDA_pd_spd_fill_values(Bk, d, (const stacked_pd *) ctx, Ck);
 }
 
+/* BTA per-block fill: plain B_k^T @ A_spd, no diagonal. The skeleton forwards d
+   verbatim and never dereferences it, so the caller passes d = NULL and we
+   ignore it. Calling BTA_pd_spd_fill_values directly (vs. the BTDA wrapper) also
+   avoids the per-block DA temp. */
+static void wrapper_BTA_pd_spd_fill(const permuted_dense *Bk, const double *d,
+                                    const void *ctx, permuted_dense *Ck)
+{
+    (void) d;
+    BTA_pd_spd_fill_values(Bk, (const stacked_pd *) ctx, Ck);
+}
+
 matrix *BTA_spd_spd_alloc(const stacked_pd *B, const stacked_pd *A)
 {
     return spd_blockwise_alloc_coalesce(B, B->base.n, A->base.n, wrapper_BTA_pd_spd,
                                         A);
 }
 
+void BTA_spd_spd_fill_values(const stacked_pd *B, const stacked_pd *A, stacked_pd *C)
+{
+    spd_blockwise_fill_coalesce_accumulate(B, NULL, A, C, wrapper_BTA_pd_spd_fill);
+}
+
 void BTDA_spd_spd_fill_values(const stacked_pd *B, const double *d,
                               const stacked_pd *A, stacked_pd *C)
 {
@@ -500,11 +548,26 @@ static void wrapper_BTDA_csc_pd(const permuted_dense *Ak, const double *d,
     BTDA_csc_pd_fill_values((const CSC_matrix *) ctx, d, Ak, Ck);
 }
 
+/* BTA per-block fill: plain B^T @ A_k, no diagonal. The skeleton forwards d
+   verbatim and never dereferences it, so the caller passes d = NULL and we
+   ignore it. */
+static void wrapper_BTA_csc_pd_fill(const permuted_dense *Ak, const double *d,
+                                    const void *ctx, permuted_dense *Ck)
+{
+    (void) d;
+    BTA_csc_pd_fill_values((const CSC_matrix *) ctx, Ak, Ck);
+}
+
 matrix *BTA_csc_spd_alloc(const CSC_matrix *B, const stacked_pd *A)
 {
     return spd_blockwise_alloc_coalesce(A, B->n, A->base.n, wrapper_BTA_csc_pd, B);
 }
 
+void BTA_csc_spd_fill_values(const CSC_matrix *B, const stacked_pd *A, stacked_pd *C)
+{
+    spd_blockwise_fill_coalesce_accumulate(A, NULL, B, C, wrapper_BTA_csc_pd_fill);
+}
+
 void BTDA_csc_spd_fill_values(const CSC_matrix *B, const double *d,
                               const stacked_pd *A, stacked_pd *C)
 {
diff --git a/tests/all_tests.c b/tests/all_tests.c
index b680d0f..3e7bf03 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -438,14 +438,26 @@ int main(void)
     mu_run_test(test_BTDA_matrices_pd_pd, tests_run);
     mu_run_test(test_BTDA_matrices_csr_pd, tests_run);
     mu_run_test(test_BTDA_matrices_pd_csr, tests_run);
+    mu_run_test(test_BTA_csc_pd_basic, tests_run);
+    mu_run_test(test_BTA_csc_pd_partial_and_excluded_cols, tests_run);
+    mu_run_test(test_BTA_csc_pd_empty, tests_run);
     mu_run_test(test_BTDA_matrices_spd_pd, tests_run);
     mu_run_test(test_BTDA_matrices_pd_spd, tests_run);
+    mu_run_test(test_BTA_pd_csc_basic, tests_run);
+    mu_run_test(test_BTA_pd_csc_partial_and_excluded_cols, tests_run);
+    mu_run_test(test_BTA_pd_csc_empty, tests_run);
     mu_run_test(test_BTDA_matrices_spd_spd, tests_run);
     mu_run_test(test_BTA_pd_spd_two_blocks_both_kept, tests_run);
     mu_run_test(test_BTDA_pd_spd_two_blocks_both_kept, tests_run);
     mu_run_test(test_BTDA_spd_pd_overlapping_cp, tests_run);
+    mu_run_test(test_BTA_spd_pd_overlapping_cp, tests_run);
     mu_run_test(test_BTDA_spd_csc_overlapping_cp, tests_run);
+    mu_run_test(test_BTA_spd_csc_overlapping, tests_run);
+    mu_run_test(test_BTA_spd_csc_block_no_overlap, tests_run);
     mu_run_test(test_BTDA_spd_spd_overlapping, tests_run);
+    mu_run_test(test_BTA_spd_spd_overlapping, tests_run);
+    mu_run_test(test_BTA_spd_spd_multi_A_per_block, tests_run);
+    mu_run_test(test_BTA_spd_spd_nonoverlapping_block, tests_run);
     mu_run_test(test_BTA_spd_matrices_pd_A, tests_run);
     mu_run_test(test_BTA_spd_matrices_csc_A, tests_run);
     mu_run_test(test_BTA_spd_matrices_spd_A, tests_run);
@@ -453,9 +465,15 @@ int main(void)
     mu_run_test(test_BTA_pd_matrices_csc_A, tests_run);
     mu_run_test(test_BTA_pd_matrices_spd_A, tests_run);
     mu_run_test(test_BTDA_csc_spd_overlapping, tests_run);
+    mu_run_test(test_BTA_csc_spd_overlapping, tests_run);
+    mu_run_test(test_BTA_csc_spd_block_no_overlap, tests_run);
     mu_run_test(test_BTA_sparse_matrices_pd_A, tests_run);
     mu_run_test(test_BTA_sparse_matrices_csc_A, tests_run);
     mu_run_test(test_BTA_sparse_matrices_spd_A, tests_run);
+    mu_run_test(test_BTA_matrices_fill_pd_spd, tests_run);
+    mu_run_test(test_BTA_matrices_fill_spd_csc, tests_run);
+    mu_run_test(test_BTA_matrices_fill_csc_pd, tests_run);
+    mu_run_test(test_BTA_matrices_fill_csc_csc, tests_run);
     mu_run_test(test_BA_pd_kron_spd_no_cache_staleness, tests_run);
     mu_run_test(test_stacked_pd_construct_and_free, tests_run);
     mu_run_test(test_coalesce_no_overlap, tests_run);
diff --git a/tests/utils/test_matmul_dispatchers.h b/tests/utils/test_matmul_dispatchers.h
index 5620574..514ca02 100644
--- a/tests/utils/test_matmul_dispatchers.h
+++ b/tests/utils/test_matmul_dispatchers.h
@@ -194,6 +194,367 @@ static matrix *spd_to_sparse_matrix_copy(matrix *spd_m)
     return new_sparse_matrix(dst);
 }
 
+/* Primitive BTA_csc_pd kernel (no diagonal): C = B^T @ A, B is CSC, A is PD.
+   Route 1 calls BTA_csc_pd_alloc + BTA_csc_pd_fill_values directly. The oracle
+   flattens BOTH operands to sparse_matrix and runs the dispatcher with d = ones,
+   which routes through the csc/csc sparse_dot path — sharing no code with the
+   csc_pd kernel. Both compared via to_csr. */
+const char *test_BTA_csc_pd_basic(void)
+{
+    /* A: 4x5 PD, row_perm = [1,3], col_perm = [0,2] (both permuted, so row_inv
+       genuinely permutes and the transpose matters). */
+    int row_perm_A[2] = {1, 3};
+    int col_perm_A[2] = {0, 2};
+    double XA[4] = {1.0, 2.0, 3.0, 4.0};
+    matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA);
+
+    /* B: 4x4 CSR -> CSC. Rows: 0:{0,2} 1:{1} 2:{0} 3:{3}. */
+    int Bp[5] = {0, 2, 3, 4, 5};
+    int Bi[5] = {0, 2, 1, 0, 3};
+    double Bx[5] = {10.0, 20.0, 30.0, 40.0, 50.0};
+
+    CSR_matrix *B_csr = new_CSR_matrix(4, 4, 5);
+    memcpy(B_csr->p, Bp, sizeof Bp);
+    memcpy(B_csr->i, Bi, sizeof Bi);
+    memcpy(B_csr->x, Bx, sizeof Bx);
+    int *iwork = (int *) malloc(MAX(B_csr->m, B_csr->n) * sizeof(int));
+    CSC_matrix *B_csc = csr_to_csc_alloc(B_csr, iwork);
+    csr_to_csc_fill_values(B_csr, B_csc, iwork);
+
+    /* Route 1: our kernel. */
+    matrix *C_ours = BTA_csc_pd_alloc(B_csc, (permuted_dense *) A_m);
+    BTA_csc_pd_fill_values(B_csc, (permuted_dense *) A_m, (permuted_dense *) C_ours);
+
+    /* Oracle: both operands sparse, d = ones, csc/csc dispatch. */
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_m);
+    CSR_matrix *B_csr2 = new_CSR_matrix(4, 4, 5);
+    memcpy(B_csr2->p, Bp, sizeof Bp);
+    memcpy(B_csr2->i, Bi, sizeof Bi);
+    memcpy(B_csr2->x, Bx, sizeof Bx);
+    matrix *B_sparse = new_sparse_matrix(B_csr2);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_CSC_matrix(B_csc);
+    free_CSR_matrix(B_csr);
+    free(iwork);
+    free_matrix(A_m);
+    return 0;
+}
+
+/* BTA_csc_pd corner case: B columns that (a) hit no A-row -> excluded from C's
+   row_perm by BTA_csc_pd_alloc; (b) partially overlap A's rows (some B->i map to
+   row_inv == -1) -> exercises sparse_dot_dense's -1 filtering; (c) fully land in
+   A's rows. A->row_perm = {1,3}. B columns by source rows: col0:{0,2} (excluded),
+   col1:{1} (clean), col2:{0,1,3} (partial), col3:{3} (clean). */
+const char *test_BTA_csc_pd_partial_and_excluded_cols(void)
+{
+    int row_perm_A[2] = {1, 3};
+    int col_perm_A[2] = {0, 2};
+    double XA[4] = {1.0, 2.0, 3.0, 4.0};
+    matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA);
+
+    /* B: 4x4. Rows: 0:{0,2} 1:{1,2} 2:{0} 3:{2,3}. */
+    int Bp[5] = {0, 2, 4, 5, 7};
+    int Bi[7] = {0, 2, 1, 2, 0, 2, 3};
+    double Bx[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
+
+    CSR_matrix *B_csr = new_CSR_matrix(4, 4, 7);
+    memcpy(B_csr->p, Bp, sizeof Bp);
+    memcpy(B_csr->i, Bi, sizeof Bi);
+    memcpy(B_csr->x, Bx, sizeof Bx);
+    int *iwork = (int *) malloc(MAX(B_csr->m, B_csr->n) * sizeof(int));
+    CSC_matrix *B_csc = csr_to_csc_alloc(B_csr, iwork);
+    csr_to_csc_fill_values(B_csr, B_csc, iwork);
+
+    matrix *C_ours = BTA_csc_pd_alloc(B_csc, (permuted_dense *) A_m);
+    BTA_csc_pd_fill_values(B_csc, (permuted_dense *) A_m, (permuted_dense *) C_ours);
+
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_m);
+    CSR_matrix *B_csr2 = new_CSR_matrix(4, 4, 7);
+    memcpy(B_csr2->p, Bp, sizeof Bp);
+    memcpy(B_csr2->i, Bi, sizeof Bi);
+    memcpy(B_csr2->x, Bx, sizeof Bx);
+    matrix *B_sparse = new_sparse_matrix(B_csr2);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_CSC_matrix(B_csc);
+    free_CSR_matrix(B_csr);
+    free(iwork);
+    free_matrix(A_m);
+    return 0;
+}
+
+/* BTA_csc_pd corner case: every B column misses A's rows -> C->base.nnz == 0
+   early-out. A->row_perm = {1,3}; B has nonzeros only in rows {0,2}. */
+const char *test_BTA_csc_pd_empty(void)
+{
+    int row_perm_A[2] = {1, 3};
+    int col_perm_A[2] = {0, 2};
+    double XA[4] = {1.0, 2.0, 3.0, 4.0};
+    matrix *A_m = new_permuted_dense(4, 5, 2, 2, row_perm_A, col_perm_A, XA);
+
+    /* B: 4x4, nonzeros only in rows 0 and 2. Rows: 0:{0,1} 2:{2,3}. */
+    int Bp[5] = {0, 2, 2, 4, 4};
+    int Bi[4] = {0, 1, 2, 3};
+    double Bx[4] = {1.0, 2.0, 3.0, 4.0};
+
+    CSR_matrix *B_csr = new_CSR_matrix(4, 4, 4);
+    memcpy(B_csr->p, Bp, sizeof Bp);
+    memcpy(B_csr->i, Bi, sizeof Bi);
+    memcpy(B_csr->x, Bx, sizeof Bx);
+    int *iwork = (int *) malloc(MAX(B_csr->m, B_csr->n) * sizeof(int));
+    CSC_matrix *B_csc = csr_to_csc_alloc(B_csr, iwork);
+    csr_to_csc_fill_values(B_csr, B_csc, iwork);
+
+    matrix *C_ours = BTA_csc_pd_alloc(B_csc, (permuted_dense *) A_m);
+    BTA_csc_pd_fill_values(B_csc, (permuted_dense *) A_m, (permuted_dense *) C_ours);
+
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_m);
+    CSR_matrix *B_csr2 = new_CSR_matrix(4, 4, 4);
+    memcpy(B_csr2->p, Bp, sizeof Bp);
+    memcpy(B_csr2->i, Bi, sizeof Bi);
+    memcpy(B_csr2->x, Bx, sizeof Bx);
+    matrix *B_sparse = new_sparse_matrix(B_csr2);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("nnz zero", csr_ours->nnz == 0);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_CSC_matrix(B_csc);
+    free_CSR_matrix(B_csr);
+    free(iwork);
+    free_matrix(A_m);
+    return 0;
+}
+
+/* Primitive BTA_pd_csc kernel (no diagonal): C = B^T @ A, B is PD, A is CSC.
+   Mirror of the csc_pd tests with roles swapped. Route 1 calls BTA_pd_csc_alloc
+   + BTA_pd_csc_fill_values. Oracle flattens BOTH operands to sparse_matrix and
+   runs the dispatcher with d = ones (csc/csc sparse_dot path), sharing no code
+   with the pd_csc kernel. Both compared via to_csr. */
+const char *test_BTA_pd_csc_basic(void)
+{
+    /* B: 4x5 PD, row_perm = [1,3], col_perm = [0,2]. */
+    int row_perm_B[2] = {1, 3};
+    int col_perm_B[2] = {0, 2};
+    double XB[4] = {1.0, 2.0, 3.0, 4.0};
+    matrix *B_m = new_permuted_dense(4, 5, 2, 2, row_perm_B, col_perm_B, XB);
+
+    /* A: 4x4 CSR -> CSC. Rows: 0:{1} 1:{0,2} 2:{3} 3:{1,3}. All A-columns
+       overlap B's rows {1,3} (some partially). */
+    int Ap[5] = {0, 1, 3, 4, 6};
+    int Ai[6] = {1, 0, 2, 3, 1, 3};
+    double Ax[6] = {10.0, 20.0, 30.0, 40.0, 50.0, 60.0};
+
+    CSR_matrix *A_csr = new_CSR_matrix(4, 4, 6);
+    memcpy(A_csr->p, Ap, sizeof Ap);
+    memcpy(A_csr->i, Ai, sizeof Ai);
+    memcpy(A_csr->x, Ax, sizeof Ax);
+    int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int));
+    CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork);
+    csr_to_csc_fill_values(A_csr, A_csc, iwork);
+
+    /* Route 1: our kernel. */
+    matrix *C_ours = BTA_pd_csc_alloc((permuted_dense *) B_m, A_csc);
+    BTA_pd_csc_fill_values((permuted_dense *) B_m, A_csc, (permuted_dense *) C_ours);
+
+    /* Oracle: both operands sparse, d = ones, csc/csc dispatch. Dispatcher arg
+       order is (rightA, leftB) and computes B^T @ A, so A goes first. */
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_m);
+    CSR_matrix *A_csr2 = new_CSR_matrix(4, 4, 6);
+    memcpy(A_csr2->p, Ap, sizeof Ap);
+    memcpy(A_csr2->i, Ai, sizeof Ai);
+    memcpy(A_csr2->x, Ax, sizeof Ax);
+    matrix *A_sparse = new_sparse_matrix(A_csr2);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(A_sparse);
+    free_matrix(B_sparse);
+    free_matrix(C_ours);
+    free_CSC_matrix(A_csc);
+    free_CSR_matrix(A_csr);
+    free(iwork);
+    free_matrix(B_m);
+    return 0;
+}
+
+/* BTA_pd_csc corner case: A columns that (a) hit no B-row -> excluded from C's
+   col_perm by BTA_pd_csc_alloc; (b) partially overlap B's rows (some A->i map to
+   row_inv == -1) -> exercises sparse_dot_dense's -1 filtering; (c) fully land in
+   B's rows. B->row_perm = {1,3}. A columns by source rows: col0:{0,2} (excluded),
+   col1:{1} (clean), col2:{0,1,3} (partial), col3:{3} (clean). */
+const char *test_BTA_pd_csc_partial_and_excluded_cols(void)
+{
+    int row_perm_B[2] = {1, 3};
+    int col_perm_B[2] = {0, 2};
+    double XB[4] = {1.0, 2.0, 3.0, 4.0};
+    matrix *B_m = new_permuted_dense(4, 5, 2, 2, row_perm_B, col_perm_B, XB);
+
+    /* A: 4x4. Rows: 0:{0,2} 1:{1,2} 2:{0} 3:{2,3}. */
+    int Ap[5] = {0, 2, 4, 5, 7};
+    int Ai[7] = {0, 2, 1, 2, 0, 2, 3};
+    double Ax[7] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0};
+
+    CSR_matrix *A_csr = new_CSR_matrix(4, 4, 7);
+    memcpy(A_csr->p, Ap, sizeof Ap);
+    memcpy(A_csr->i, Ai, sizeof Ai);
+    memcpy(A_csr->x, Ax, sizeof Ax);
+    int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int));
+    CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork);
+    csr_to_csc_fill_values(A_csr, A_csc, iwork);
+
+    matrix *C_ours = BTA_pd_csc_alloc((permuted_dense *) B_m, A_csc);
+    BTA_pd_csc_fill_values((permuted_dense *) B_m, A_csc, (permuted_dense *) C_ours);
+
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_m);
+    CSR_matrix *A_csr2 = new_CSR_matrix(4, 4, 7);
+    memcpy(A_csr2->p, Ap, sizeof Ap);
+    memcpy(A_csr2->i, Ai, sizeof Ai);
+    memcpy(A_csr2->x, Ax, sizeof Ax);
+    matrix *A_sparse = new_sparse_matrix(A_csr2);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(A_sparse);
+    free_matrix(B_sparse);
+    free_matrix(C_ours);
+    free_CSC_matrix(A_csc);
+    free_CSR_matrix(A_csr);
+    free(iwork);
+    free_matrix(B_m);
+    return 0;
+}
+
+/* BTA_pd_csc corner case: every A column misses B's rows -> C->base.nnz == 0
+   early-out. B->row_perm = {1,3}; A has nonzeros only in rows {0,2}. */
+const char *test_BTA_pd_csc_empty(void)
+{
+    int row_perm_B[2] = {1, 3};
+    int col_perm_B[2] = {0, 2};
+    double XB[4] = {1.0, 2.0, 3.0, 4.0};
+    matrix *B_m = new_permuted_dense(4, 5, 2, 2, row_perm_B, col_perm_B, XB);
+
+    /* A: 4x4, nonzeros only in rows 0 and 2. Rows: 0:{0,1} 2:{2,3}. */
+    int Ap[5] = {0, 2, 2, 4, 4};
+    int Ai[4] = {0, 1, 2, 3};
+    double Ax[4] = {1.0, 2.0, 3.0, 4.0};
+
+    CSR_matrix *A_csr = new_CSR_matrix(4, 4, 4);
+    memcpy(A_csr->p, Ap, sizeof Ap);
+    memcpy(A_csr->i, Ai, sizeof Ai);
+    memcpy(A_csr->x, Ax, sizeof Ax);
+    int *iwork = (int *) malloc(MAX(A_csr->m, A_csr->n) * sizeof(int));
+    CSC_matrix *A_csc = csr_to_csc_alloc(A_csr, iwork);
+    csr_to_csc_fill_values(A_csr, A_csc, iwork);
+
+    matrix *C_ours = BTA_pd_csc_alloc((permuted_dense *) B_m, A_csc);
+    BTA_pd_csc_fill_values((permuted_dense *) B_m, A_csc, (permuted_dense *) C_ours);
+
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_m);
+    CSR_matrix *A_csr2 = new_CSR_matrix(4, 4, 4);
+    memcpy(A_csr2->p, Ap, sizeof Ap);
+    memcpy(A_csr2->i, Ai, sizeof Ai);
+    memcpy(A_csr2->x, Ax, sizeof Ax);
+    matrix *A_sparse = new_sparse_matrix(A_csr2);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("nnz zero", csr_ours->nnz == 0);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+
+    free_matrix(C_ref);
+    free_matrix(A_sparse);
+    free_matrix(B_sparse);
+    free_matrix(C_ours);
+    free_CSC_matrix(A_csc);
+    free_CSR_matrix(A_csr);
+    free(iwork);
+    free_matrix(B_m);
+    return 0;
+}
+
 /* Wrapper dispatch: (A=spd, B=PD). The spd-fallback path goes through
    BTA_csc_pd_alloc / BTDA_csc_pd_fill_values; result must match the
    equivalent (A=sparse, B=PD) dispatch on the same matrix. */
@@ -561,6 +922,63 @@ const char *test_BTDA_spd_pd_overlapping_cp(void)
     return 0;
 }
 
+/* Primitive BTA_spd_pd kernel (no diagonal). Same matrices as the BTDA
+   variant above. Reference is the production dispatcher with B flattened to a
+   sparse_matrix and d = ones, which equals plain B^T @ A. */
+const char *test_BTA_spd_pd_overlapping_cp(void)
+{
+    /* B: 4x3 spd, two blocks with overlapping col_perms (share col 2).
+       blk0: rows {0,1}, cols {0,2}, X = [[1,2],[3,4]]
+       blk1: rows {2,3}, cols {1,2}, X = [[5,6],[7,8]]                     */
+    int B0_rp[2] = {0, 1};
+    int B0_cp[2] = {0, 2};
+    double B0X[4] = {1, 2, 3, 4};
+    matrix *blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
+    int B1_rp[2] = {2, 3};
+    int B1_cp[2] = {1, 2};
+    double B1X[4] = {5, 6, 7, 8};
+    matrix *blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) blk0, (permuted_dense *) blk1};
+    matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL);
+
+    /* A: 4x3 PD, full row_perm, full col_perm. */
+    int A_rp[4] = {0, 1, 2, 3};
+    int A_cp[3] = {0, 1, 2};
+    double AX[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+    matrix *A = new_permuted_dense(4, 3, 4, 3, A_rp, A_cp, AX);
+
+    /* Route 1: our new BTA_spd_pd_fill_values. */
+    matrix *C_ours = BTA_spd_pd_alloc((stacked_pd *) B_spd, (permuted_dense *) A);
+    BTA_spd_pd_fill_values((stacked_pd *) B_spd, (permuted_dense *) A,
+                           (stacked_pd *) C_ours);
+
+    /* Route 2: dispatcher with B flattened to sparse_matrix and d = ones, so
+       BTDA == BTA. Note BTA_matrices_alloc(A, B) computes B^T @ A, so A goes
+       first. */
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd);
+    matrix *C_ref = BTA_matrices_alloc(A, B_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A, d_ones, B_sparse, C_ref);
+
+    /* C_ours is stacked_pd, C_ref is permuted_dense — compare via to_csr. */
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(C_ours);
+    free_matrix(A);
+    free_matrix(B_spd);
+    return 0;
+}
+
 /* Primitive BTDA_spd_csc kernel (ATA-style direct: per-block BTDA_pd_csc
    + accumulating coalesce). C = B^T @ diag(d) @ A with B a 2-block
    stacked_pd whose col_perms share column 2, and A a sparse_matrix
@@ -636,47 +1054,169 @@ const char *test_BTDA_spd_csc_overlapping_cp(void)
     return 0;
 }
 
-/* Primitive BTDA_spd_spd kernel (per-block BTDA_pd_spd + accumulating coalesce).
-   C = B^T @ diag(d) @ A with both B and A 2-block stacked_pds. B's c_k's
-   share col 2 (outer coalesce-accumulate path), and A's col_perms also
-   share at least one column (inner BTA_pd_spd accumulate path). Reference
-   is the production dispatcher with both operands flattened to sparse_matrix.
-   Output types differ (stacked_pd vs sparse_matrix); compare via to_csr. */
-const char *test_BTDA_spd_spd_overlapping(void)
+/* Primitive BTA_spd_csc kernel (no diagonal): C = B^T @ A, B is a 2-block
+   stacked_pd whose col_perms share a column (accumulating coalesce fires), A is
+   CSC. Same matrices as the BTDA variant above. Oracle is the dispatcher with B
+   flattened to sparse and d = ones (csc/csc sparse_dot path). */
+const char *test_BTA_spd_csc_overlapping(void)
 {
-    /* B: 4x3 spd, two blocks with overlapping c_k(B) (share col 2). */
+    /* B: 4x3 spd, two blocks with overlapping col_perms (share col 2). */
     int B0_rp[2] = {0, 1};
     int B0_cp[2] = {0, 2};
     double B0X[4] = {1, 2, 3, 4};
-    matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
+    matrix *blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
     int B1_rp[2] = {2, 3};
     int B1_cp[2] = {1, 2};
     double B1X[4] = {5, 6, 7, 8};
-    matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
-    permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0,
-                                   (permuted_dense *) B_blk1};
+    matrix *blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) blk0, (permuted_dense *) blk1};
     matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL);
 
-    /* A: 4x3 spd, two blocks. A_0's rows overlap B_0's, A_1's rows overlap
-       B_1's. A_0 and A_1 share col 1 → inner BTA_pd_spd accumulate fires. */
-    int A0_rp[2] = {0, 1};
-    int A0_cp[2] = {0, 1};
-    double A0X[4] = {10, 11, 12, 13};
-    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
-    int A1_rp[2] = {2, 3};
-    int A1_cp[2] = {1, 2};
-    double A1X[4] = {20, 21, 22, 23};
-    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
-    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
-                                   (permuted_dense *) A_blk1};
-    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
-
-    double d[4] = {2.0, -1.5, 0.5, 1.25};
+    /* A: 4x3 sparse_matrix wrapping a CSR. Rows: 0:{0,2} 1:{1} 2:{0,1,2} 3:{2}. */
+    CSR_matrix *A_csr = new_CSR_matrix(4, 3, 7);
+    int Ap[5] = {0, 2, 3, 6, 7};
+    int Ai[7] = {0, 2, 1, 0, 1, 2, 2};
+    double Ax[7] = {1, 2, 3, 4, 5, 6, 7};
+    memcpy(A_csr->p, Ap, sizeof(Ap));
+    memcpy(A_csr->i, Ai, sizeof(Ai));
+    memcpy(A_csr->x, Ax, sizeof(Ax));
+    matrix *A_sm = new_sparse_matrix(A_csr);
 
-    /* Route 1: new kernel. */
-    matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd);
-    BTDA_spd_spd_fill_values((stacked_pd *) B_spd, d, (stacked_pd *) A_spd,
-                             (stacked_pd *) C_ours);
+    /* Route 1: new kernel. Need A's csc_cache. */
+    sparse_matrix_ensure_csc_cache((sparse_matrix *) A_sm);
+    A_sm->refresh_csc_values(A_sm);
+    matrix *C_ours =
+        BTA_spd_csc_alloc((stacked_pd *) B_spd, ((sparse_matrix *) A_sm)->csc_cache);
+    BTA_spd_csc_fill_values((stacked_pd *) B_spd,
+                            ((sparse_matrix *) A_sm)->csc_cache,
+                            (stacked_pd *) C_ours);
+
+    /* Route 2: dispatcher with B flattened to sparse and d = ones, so BTDA == BTA.
+       BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sm, B_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sm, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(C_ours);
+    free_matrix(A_sm);
+    free_matrix(B_spd);
+    return 0;
+}
+
+/* BTA_spd_csc corner case: one B-block whose rows match no nonzero row of A ->
+   that block's per-block partial is empty (m0 = 0), while the other block
+   contributes. Exercises the empty-source-block path through BTA_pd_csc_alloc
+   and the accumulating coalesce. */
+const char *test_BTA_spd_csc_block_no_overlap(void)
+{
+    /* B: 4x3 spd, two blocks. blk0 rows {0,1}; blk1 rows {2,3}. */
+    int B0_rp[2] = {0, 1};
+    int B0_cp[2] = {0, 1};
+    double B0X[4] = {1, 2, 3, 4};
+    matrix *blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
+    int B1_rp[2] = {2, 3};
+    int B1_cp[2] = {1, 2};
+    double B1X[4] = {5, 6, 7, 8};
+    matrix *blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) blk0, (permuted_dense *) blk1};
+    matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL);
+
+    /* A: 4x3, nonzeros only in rows {0,1} -> overlaps blk0 only; blk1's rows
+       {2,3} hit no A nonzero -> empty partial. Rows: 0:{0,2} 1:{1}. */
+    CSR_matrix *A_csr = new_CSR_matrix(4, 3, 3);
+    int Ap[5] = {0, 2, 3, 3, 3};
+    int Ai[3] = {0, 2, 1};
+    double Ax[3] = {1, 2, 3};
+    memcpy(A_csr->p, Ap, sizeof(Ap));
+    memcpy(A_csr->i, Ai, sizeof(Ai));
+    memcpy(A_csr->x, Ax, sizeof(Ax));
+    matrix *A_sm = new_sparse_matrix(A_csr);
+
+    sparse_matrix_ensure_csc_cache((sparse_matrix *) A_sm);
+    A_sm->refresh_csc_values(A_sm);
+    matrix *C_ours =
+        BTA_spd_csc_alloc((stacked_pd *) B_spd, ((sparse_matrix *) A_sm)->csc_cache);
+    BTA_spd_csc_fill_values((stacked_pd *) B_spd,
+                            ((sparse_matrix *) A_sm)->csc_cache,
+                            (stacked_pd *) C_ours);
+
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sm, B_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sm, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(C_ours);
+    free_matrix(A_sm);
+    free_matrix(B_spd);
+    return 0;
+}
+
+/* Primitive BTDA_spd_spd kernel (per-block BTDA_pd_spd + accumulating coalesce).
+   C = B^T @ diag(d) @ A with both B and A 2-block stacked_pds. B's c_k's
+   share col 2 (outer coalesce-accumulate path), and A's col_perms also
+   share at least one column (inner BTA_pd_spd accumulate path). Reference
+   is the production dispatcher with both operands flattened to sparse_matrix.
+   Output types differ (stacked_pd vs sparse_matrix); compare via to_csr. */
+const char *test_BTDA_spd_spd_overlapping(void)
+{
+    /* B: 4x3 spd, two blocks with overlapping c_k(B) (share col 2). */
+    int B0_rp[2] = {0, 1};
+    int B0_cp[2] = {0, 2};
+    double B0X[4] = {1, 2, 3, 4};
+    matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
+    int B1_rp[2] = {2, 3};
+    int B1_cp[2] = {1, 2};
+    double B1X[4] = {5, 6, 7, 8};
+    matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0,
+                                   (permuted_dense *) B_blk1};
+    matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL);
+
+    /* A: 4x3 spd, two blocks. A_0's rows overlap B_0's, A_1's rows overlap
+       B_1's. A_0 and A_1 share col 1 → inner BTA_pd_spd accumulate fires. */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 1};
+    double A0X[4] = {10, 11, 12, 13};
+    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
+    int A1_rp[2] = {2, 3};
+    int A1_cp[2] = {1, 2};
+    double A1X[4] = {20, 21, 22, 23};
+    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
+    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
+                                   (permuted_dense *) A_blk1};
+    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
+
+    double d[4] = {2.0, -1.5, 0.5, 1.25};
+
+    /* Route 1: new kernel. */
+    matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd);
+    BTDA_spd_spd_fill_values((stacked_pd *) B_spd, d, (stacked_pd *) A_spd,
+                             (stacked_pd *) C_ours);
 
     /* Route 2: dispatcher with both flattened to sparse_matrix.
        BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */
@@ -706,6 +1246,188 @@ const char *test_BTDA_spd_spd_overlapping(void)
     return 0;
 }
 
+/* Primitive BTA_spd_spd kernel (no diagonal). Same matrices as the BTDA
+   variant above. Reference is the production dispatcher with both operands
+   flattened to sparse_matrix and d = ones, which equals plain B^T @ A. */
+const char *test_BTA_spd_spd_overlapping(void)
+{
+    /* B: 4x3 spd, two blocks with overlapping c_k(B) (share col 2). */
+    int B0_rp[2] = {0, 1};
+    int B0_cp[2] = {0, 2};
+    double B0X[4] = {1, 2, 3, 4};
+    matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
+    int B1_rp[2] = {2, 3};
+    int B1_cp[2] = {1, 2};
+    double B1X[4] = {5, 6, 7, 8};
+    matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0,
+                                   (permuted_dense *) B_blk1};
+    matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL);
+
+    /* A: 4x3 spd, two blocks. A_0's rows overlap B_0's, A_1's rows overlap
+       B_1's. A_0 and A_1 share col 1. */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 1};
+    double A0X[4] = {10, 11, 12, 13};
+    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
+    int A1_rp[2] = {2, 3};
+    int A1_cp[2] = {1, 2};
+    double A1X[4] = {20, 21, 22, 23};
+    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
+    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
+                                   (permuted_dense *) A_blk1};
+    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
+
+    /* Route 1: new kernel. */
+    matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd);
+    BTA_spd_spd_fill_values((stacked_pd *) B_spd, (stacked_pd *) A_spd,
+                            (stacked_pd *) C_ours);
+
+    /* Route 2: dispatcher with both flattened to sparse_matrix and d = ones, so
+       BTDA == BTA. BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd);
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    /* C_ours is stacked_pd, C_ref is sparse_matrix — compare via to_csr. */
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_matrix(A_spd);
+    free_matrix(B_spd);
+    return 0;
+}
+
+/* BTA_spd_spd corner case: a single B-block whose rows span MULTIPLE A-blocks,
+   forcing BTA_pd_spd_fill_values' inner += accumulation over A-blocks into one
+   partial, with overlapping output columns (A_0 and A_1 share col 1). Not
+   exercised by test_BTA_spd_spd_overlapping (there each B-block overlaps exactly
+   one A-block). */
+const char *test_BTA_spd_spd_multi_A_per_block(void)
+{
+    /* B: 4x2, one block spanning all four contraction rows. */
+    int B0_rp[4] = {0, 1, 2, 3};
+    int B0_cp[2] = {0, 1};
+    double B0X[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+    matrix *B_blk0 = new_permuted_dense(4, 2, 4, 2, B0_rp, B0_cp, B0X);
+    permuted_dense *B_blocks[1] = {(permuted_dense *) B_blk0};
+    matrix *B_spd = new_stacked_pd(4, 2, 1, B_blocks, NULL, NULL);
+
+    /* A: 4x3, two blocks with disjoint rows but overlapping cols (share col 1).
+       Both overlap B_0's rows, so both contribute to B_0's single partial. */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 1};
+    double A0X[4] = {10, 11, 12, 13};
+    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
+    int A1_rp[2] = {2, 3};
+    int A1_cp[2] = {1, 2};
+    double A1X[4] = {20, 21, 22, 23};
+    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
+    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
+                                   (permuted_dense *) A_blk1};
+    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
+
+    matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd);
+    BTA_spd_spd_fill_values((stacked_pd *) B_spd, (stacked_pd *) A_spd,
+                            (stacked_pd *) C_ours);
+
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd);
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_matrix(A_spd);
+    free_matrix(B_spd);
+    return 0;
+}
+
+/* BTA_spd_spd corner case: a B-block that overlaps NO A-block, producing an
+   empty partial (n0 = 0) that coalesces to a zero-column output block (output
+   row 1, all zeros). Exercises the partial-level nnz == 0 early-out in
+   BTA_pd_spd_fill_values and the empty-block path through coalesce/to_csr. */
+const char *test_BTA_spd_spd_nonoverlapping_block(void)
+{
+    /* B: 4x2, two blocks. B_0 rows {0,1}; B_1 rows {2,3}. */
+    int B0_rp[2] = {0, 1};
+    int B0_cp[1] = {0};
+    double B0X[2] = {1, 2};
+    matrix *B_blk0 = new_permuted_dense(4, 2, 2, 1, B0_rp, B0_cp, B0X);
+    int B1_rp[2] = {2, 3};
+    int B1_cp[1] = {1};
+    double B1X[2] = {3, 4};
+    matrix *B_blk1 = new_permuted_dense(4, 2, 2, 1, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0,
+                                   (permuted_dense *) B_blk1};
+    matrix *B_spd = new_stacked_pd(4, 2, 2, B_blocks, NULL, NULL);
+
+    /* A: 4x2, one block on rows {0,1}. B_0 overlaps it; B_1 does not. */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 1};
+    double A0X[4] = {10, 11, 12, 13};
+    matrix *A_blk0 = new_permuted_dense(4, 2, 2, 2, A0_rp, A0_cp, A0X);
+    permuted_dense *A_blocks[1] = {(permuted_dense *) A_blk0};
+    matrix *A_spd = new_stacked_pd(4, 2, 1, A_blocks, NULL, NULL);
+
+    matrix *C_ours = BTA_spd_spd_alloc((stacked_pd *) B_spd, (stacked_pd *) A_spd);
+    BTA_spd_spd_fill_values((stacked_pd *) B_spd, (stacked_pd *) A_spd,
+                            (stacked_pd *) C_ours);
+
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd);
+    matrix *B_sparse = spd_to_sparse_matrix_copy(B_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sparse);
+    A_sparse->refresh_csc_values(A_sparse);
+    B_sparse->refresh_csc_values(B_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sparse, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(B_sparse);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_matrix(A_spd);
+    free_matrix(B_spd);
+    return 0;
+}
+
 /* BTA_spd_matrices dispatcher: A is permuted_dense. Verifies the
    PD-branch routes to BTA_spd_pd / BTDA_spd_pd. */
 const char *test_BTA_spd_matrices_pd_A(void)
@@ -1121,6 +1843,128 @@ const char *test_BTDA_csc_spd_overlapping(void)
     return 0;
 }
 
+/* Primitive BTA_csc_spd kernel (no diagonal): C = B^T @ A, B is CSC, A is a
+   2-block stacked_pd whose col_perms share a column (so the accumulating coalesce
+   path fires). Same matrices as the BTDA variant above. Oracle is the dispatcher
+   with A flattened to sparse and d = ones (csc/csc sparse_dot path). */
+const char *test_BTA_csc_spd_overlapping(void)
+{
+    /* A: 4x3 spd, two blocks with overlapping col_perms (share col 2). */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 2};
+    double A0X[4] = {1, 2, 3, 4};
+    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
+    int A1_rp[2] = {2, 3};
+    int A1_cp[2] = {1, 2};
+    double A1X[4] = {5, 6, 7, 8};
+    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
+    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
+                                   (permuted_dense *) A_blk1};
+    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
+
+    /* B: 4x3 sparse_matrix wrapping a CSR. */
+    CSR_matrix *B_csr = new_CSR_matrix(4, 3, 7);
+    int Bp[5] = {0, 2, 3, 6, 7};
+    int Bi[7] = {0, 2, 1, 0, 1, 2, 2};
+    double Bx[7] = {1, 2, 3, 4, 5, 6, 7};
+    memcpy(B_csr->p, Bp, sizeof(Bp));
+    memcpy(B_csr->i, Bi, sizeof(Bi));
+    memcpy(B_csr->x, Bx, sizeof(Bx));
+    matrix *B_sm = new_sparse_matrix(B_csr);
+
+    /* Route 1: new kernel. Ensure B's csc_cache and refresh values. */
+    sparse_matrix_ensure_csc_cache((sparse_matrix *) B_sm);
+    B_sm->refresh_csc_values(B_sm);
+    matrix *C_ours =
+        BTA_csc_spd_alloc(((sparse_matrix *) B_sm)->csc_cache, (stacked_pd *) A_spd);
+    BTA_csc_spd_fill_values(((sparse_matrix *) B_sm)->csc_cache,
+                            (stacked_pd *) A_spd, (stacked_pd *) C_ours);
+
+    /* Route 2: dispatcher with A flattened to sparse_matrix and d = ones, so
+       BTDA == BTA. BTA_matrices_alloc(A, B) computes B^T @ A, so A goes first. */
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sm);
+    A_sparse->refresh_csc_values(A_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sm, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_matrix(B_sm);
+    free_matrix(A_spd);
+    return 0;
+}
+
+/* BTA_csc_spd corner case: one A-block whose row_perm matches no nonzero row of
+   any B column -> that block's per-block partial is empty (m0 = 0), while the
+   other block contributes. Exercises the empty-source-block path through
+   BTA_csc_pd_alloc and the accumulating coalesce. */
+const char *test_BTA_csc_spd_block_no_overlap(void)
+{
+    /* A: 4x3 spd, two blocks. blk0 rows {0,1}; blk1 rows {2,3}. */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 1};
+    double A0X[4] = {1, 2, 3, 4};
+    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
+    int A1_rp[2] = {2, 3};
+    int A1_cp[2] = {1, 2};
+    double A1X[4] = {5, 6, 7, 8};
+    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
+    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
+                                   (permuted_dense *) A_blk1};
+    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
+
+    /* B: 4x3, nonzeros only in rows {0,1} -> overlaps blk0 only; blk1's rows
+       {2,3} hit no B column -> empty partial for blk1. Rows: 0:{0,2} 1:{1}. */
+    CSR_matrix *B_csr = new_CSR_matrix(4, 3, 3);
+    int Bp[5] = {0, 2, 3, 3, 3};
+    int Bi[3] = {0, 2, 1};
+    double Bx[3] = {1, 2, 3};
+    memcpy(B_csr->p, Bp, sizeof(Bp));
+    memcpy(B_csr->i, Bi, sizeof(Bi));
+    memcpy(B_csr->x, Bx, sizeof(Bx));
+    matrix *B_sm = new_sparse_matrix(B_csr);
+
+    sparse_matrix_ensure_csc_cache((sparse_matrix *) B_sm);
+    B_sm->refresh_csc_values(B_sm);
+    matrix *C_ours =
+        BTA_csc_spd_alloc(((sparse_matrix *) B_sm)->csc_cache, (stacked_pd *) A_spd);
+    BTA_csc_spd_fill_values(((sparse_matrix *) B_sm)->csc_cache,
+                            (stacked_pd *) A_spd, (stacked_pd *) C_ours);
+
+    matrix *A_sparse = spd_to_sparse_matrix_copy(A_spd);
+    matrix *C_ref = BTA_matrices_alloc(A_sparse, B_sm);
+    A_sparse->refresh_csc_values(A_sparse);
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+    BTDA_matrices_fill_values(A_sparse, d_ones, B_sm, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(A_sparse);
+    free_matrix(C_ours);
+    free_matrix(B_sm);
+    free_matrix(A_spd);
+    return 0;
+}
+
 /* BTA_sparse_matrices dispatcher: A is permuted_dense. Both routes produce
    PD; compare X / perms directly. */
 const char *test_BTA_sparse_matrices_pd_A(void)
@@ -1322,6 +2166,203 @@ const char *test_BTA_sparse_matrices_spd_A(void)
     return 0;
 }
 
+/* Top-level BTA_matrices_fill_values dispatcher. Each test runs Route 1 =
+   BTA_matrices_alloc + BTA_matrices_fill_values and Route 2 = the same alloc +
+   BTDA_matrices_fill_values with d = ones (B^T diag(1) A = B^T A). The two
+   share structure, so we compare values via to_csr. Together the four cover all
+   three B-branches and the special csc/csc (d == NULL) path. Recall
+   BTA_matrices_alloc(A, B) computes B^T @ A, so the transposed operand goes
+   second. */
+const char *test_BTA_matrices_fill_pd_spd(void)
+{
+    /* B: 4x5 PD (transposed operand). */
+    int B_rp[3] = {0, 1, 2};
+    int B_cp[3] = {1, 3, 4};
+    double BX[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    matrix *B = new_permuted_dense(4, 5, 3, 3, B_rp, B_cp, BX);
+
+    /* A: 4x3 spd, two blocks (share col 2). */
+    int A0_rp[2] = {0, 1};
+    int A0_cp[2] = {0, 2};
+    double A0X[4] = {1, 2, 3, 4};
+    matrix *A_blk0 = new_permuted_dense(4, 3, 2, 2, A0_rp, A0_cp, A0X);
+    int A1_rp[2] = {2, 3};
+    int A1_cp[2] = {1, 2};
+    double A1X[4] = {5, 6, 7, 8};
+    matrix *A_blk1 = new_permuted_dense(4, 3, 2, 2, A1_rp, A1_cp, A1X);
+    permuted_dense *A_blocks[2] = {(permuted_dense *) A_blk0,
+                                   (permuted_dense *) A_blk1};
+    matrix *A_spd = new_stacked_pd(4, 3, 2, A_blocks, NULL, NULL);
+
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+
+    matrix *C_ours = BTA_matrices_alloc(A_spd, B);
+    BTA_matrices_fill_values(A_spd, B, C_ours);
+
+    matrix *C_ref = BTA_matrices_alloc(A_spd, B);
+    BTDA_matrices_fill_values(A_spd, d_ones, B, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(C_ours);
+    free_matrix(A_spd);
+    free_matrix(B);
+    return 0;
+}
+
+const char *test_BTA_matrices_fill_spd_csc(void)
+{
+    /* B: 4x3 spd, two blocks (transposed operand). */
+    int B0_rp[2] = {0, 1};
+    int B0_cp[2] = {0, 2};
+    double B0X[4] = {1, 2, 3, 4};
+    matrix *B_blk0 = new_permuted_dense(4, 3, 2, 2, B0_rp, B0_cp, B0X);
+    int B1_rp[2] = {2, 3};
+    int B1_cp[2] = {1, 2};
+    double B1X[4] = {5, 6, 7, 8};
+    matrix *B_blk1 = new_permuted_dense(4, 3, 2, 2, B1_rp, B1_cp, B1X);
+    permuted_dense *B_blocks[2] = {(permuted_dense *) B_blk0,
+                                   (permuted_dense *) B_blk1};
+    matrix *B_spd = new_stacked_pd(4, 3, 2, B_blocks, NULL, NULL);
+
+    /* A: 4x3 sparse_matrix. */
+    CSR_matrix *A_csr = new_CSR_matrix(4, 3, 7);
+    int Ap[5] = {0, 2, 3, 6, 7};
+    int Ai[7] = {0, 2, 1, 0, 1, 2, 2};
+    double Ax[7] = {1, 2, 3, 4, 5, 6, 7};
+    memcpy(A_csr->p, Ap, sizeof(Ap));
+    memcpy(A_csr->i, Ai, sizeof(Ai));
+    memcpy(A_csr->x, Ax, sizeof(Ax));
+    matrix *A_sm = new_sparse_matrix(A_csr);
+
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+
+    matrix *C_ours = BTA_matrices_alloc(A_sm, B_spd);
+    A_sm->refresh_csc_values(A_sm);
+    BTA_matrices_fill_values(A_sm, B_spd, C_ours);
+
+    matrix *C_ref = BTA_matrices_alloc(A_sm, B_spd);
+    A_sm->refresh_csc_values(A_sm);
+    BTDA_matrices_fill_values(A_sm, d_ones, B_spd, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(C_ours);
+    free_matrix(A_sm);
+    free_matrix(B_spd);
+    return 0;
+}
+
+const char *test_BTA_matrices_fill_csc_pd(void)
+{
+    /* B: 4x4 sparse_matrix (transposed operand). */
+    CSR_matrix *B_csr = new_CSR_matrix(4, 4, 5);
+    int Bp[5] = {0, 2, 3, 4, 5};
+    int Bi[5] = {0, 2, 1, 0, 3};
+    double Bx[5] = {10, 20, 30, 40, 50};
+    memcpy(B_csr->p, Bp, sizeof(Bp));
+    memcpy(B_csr->i, Bi, sizeof(Bi));
+    memcpy(B_csr->x, Bx, sizeof(Bx));
+    matrix *B_sm = new_sparse_matrix(B_csr);
+
+    /* A: 4x3 PD. */
+    int A_rp[4] = {0, 1, 2, 3};
+    int A_cp[3] = {0, 1, 2};
+    double AX[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+    matrix *A = new_permuted_dense(4, 3, 4, 3, A_rp, A_cp, AX);
+
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+
+    matrix *C_ours = BTA_matrices_alloc(A, B_sm);
+    B_sm->refresh_csc_values(B_sm);
+    BTA_matrices_fill_values(A, B_sm, C_ours);
+
+    matrix *C_ref = BTA_matrices_alloc(A, B_sm);
+    B_sm->refresh_csc_values(B_sm);
+    BTDA_matrices_fill_values(A, d_ones, B_sm, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(C_ours);
+    free_matrix(A);
+    free_matrix(B_sm);
+    return 0;
+}
+
+const char *test_BTA_matrices_fill_csc_csc(void)
+{
+    /* B: 4x3 sparse_matrix (transposed operand). */
+    CSR_matrix *B_csr = new_CSR_matrix(4, 3, 7);
+    int Bp[5] = {0, 2, 3, 6, 7};
+    int Bi[7] = {0, 2, 1, 0, 1, 2, 2};
+    double Bx[7] = {1, 2, 3, 4, 5, 6, 7};
+    memcpy(B_csr->p, Bp, sizeof(Bp));
+    memcpy(B_csr->i, Bi, sizeof(Bi));
+    memcpy(B_csr->x, Bx, sizeof(Bx));
+    matrix *B_sm = new_sparse_matrix(B_csr);
+
+    /* A: 4x4 sparse_matrix. */
+    CSR_matrix *A_csr = new_CSR_matrix(4, 4, 5);
+    int Ap[5] = {0, 2, 3, 4, 5};
+    int Ai[5] = {0, 2, 1, 0, 3};
+    double Ax[5] = {10, 20, 30, 40, 50};
+    memcpy(A_csr->p, Ap, sizeof(Ap));
+    memcpy(A_csr->i, Ai, sizeof(Ai));
+    memcpy(A_csr->x, Ax, sizeof(Ax));
+    matrix *A_sm = new_sparse_matrix(A_csr);
+
+    double d_ones[4] = {1.0, 1.0, 1.0, 1.0};
+
+    matrix *C_ours = BTA_matrices_alloc(A_sm, B_sm);
+    A_sm->refresh_csc_values(A_sm);
+    B_sm->refresh_csc_values(B_sm);
+    BTA_matrices_fill_values(A_sm, B_sm, C_ours);
+
+    matrix *C_ref = BTA_matrices_alloc(A_sm, B_sm);
+    A_sm->refresh_csc_values(A_sm);
+    B_sm->refresh_csc_values(B_sm);
+    BTDA_matrices_fill_values(A_sm, d_ones, B_sm, C_ref);
+
+    CSR_matrix *csr_ours = C_ours->to_csr(C_ours);
+    CSR_matrix *csr_ref = C_ref->to_csr(C_ref);
+    mu_assert("m", csr_ours->m == csr_ref->m);
+    mu_assert("n", csr_ours->n == csr_ref->n);
+    mu_assert("nnz", csr_ours->nnz == csr_ref->nnz);
+    mu_assert("p", cmp_int_array(csr_ours->p, csr_ref->p, csr_ours->m + 1));
+    mu_assert("i", cmp_int_array(csr_ours->i, csr_ref->i, csr_ours->nnz));
+    mu_assert("x", cmp_double_array(csr_ours->x, csr_ref->x, csr_ours->nnz));
+
+    free_matrix(C_ref);
+    free_matrix(C_ours);
+    free_matrix(A_sm);
+    free_matrix(B_sm);
+    return 0;
+}
+
 /* Regression test for the kron-scratch / transpose-cache crash. Exercises
    BA_dense_kron_spd_fill_values with p > 1: the underlying kron_for_each_
    output_block reuses a single scratch PD across iterations, mutating