Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/utils/matmul_dispatchers.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@

Contract: for sparse_matrix operands (A or B), the caller is responsible for
refreshing csc_cache values via refresh_csc_values before
BTDA_matrices_fill_values. */
BTA_matrices_fill_values / BTDA_matrices_fill_values. */
matrix *BTA_matrices_alloc(matrix *A, matrix *B);
void BTA_matrices_fill_values(matrix *A, matrix *B, matrix *C);
void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C);

/* Polymorphic dispatchers for C = B @ A, where B is permuted_dense and A is any
Expand Down
8 changes: 8 additions & 0 deletions include/utils/permuted_dense_linalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,21 @@ void BA_pd_pd_fill_values(const permuted_dense *B, const permuted_dense *A,
/* Allocate new permuted dense for C = B^T @ A where B is PD and A is CSC */
matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A);

/* Fill values of C = B^T @ A where B is PD and A is CSC. */
void BTA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A,
permuted_dense *C);

/* Fill values of C = B^T @ diag(d) @ A where B is PD and A is CSC */
void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d,
const CSC_matrix *A, permuted_dense *C);

/* Allocate new permuted_dense for C = B^T @ A where B is Sparse CSC and A is PD. */
matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A);

/* Fill values of C = B^T @ A where B is CSC and A is PD. */
void BTA_csc_pd_fill_values(const CSC_matrix *B, const permuted_dense *A,
permuted_dense *C);

/* Fill values of C = B^T @ diag(d) @ A where B is CSC and A is PD */
void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
const permuted_dense *A, permuted_dense *C);
Expand Down
20 changes: 20 additions & 0 deletions include/utils/stacked_pd_linalg.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ void BTDA_pd_spd_fill_values(const permuted_dense *B, const double *d,
permuted_dense. The output C is stacked_pd. */
matrix *BTA_spd_pd_alloc(const stacked_pd *B, const permuted_dense *A);

/* Fill values of C = B^T @ A where B is stacked_pd, A is permuted_dense, and
C is stacked_pd. */
void BTA_spd_pd_fill_values(const stacked_pd *B, const permuted_dense *A,
stacked_pd *C);

/* Fill values of C = B^T @ diag(d) @ A where B is stacked_pd, A is
permuted_dense, d is a global vector of length B->m, and C is stacked_pd. */
void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d,
Expand All @@ -70,6 +75,11 @@ void BTDA_spd_pd_fill_values(const stacked_pd *B, const double *d,
The output C is stacked_pd. */
matrix *BTA_spd_csc_alloc(const stacked_pd *B, const CSC_matrix *A);

/* Fill values of C = B^T @ A where B is stacked_pd and A is CSC, and C is
stacked_pd. C must be pre-allocated via BTA_spd_csc_alloc. */
void BTA_spd_csc_fill_values(const stacked_pd *B, const CSC_matrix *A,
stacked_pd *C);

/* Fill values of C = B^T @ diag(d) @ A where B is stacked_pd, A is CSC,
d is a global vector of length B->m, and C is stacked_pd. */
void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d,
Expand All @@ -79,6 +89,11 @@ void BTDA_spd_csc_fill_values(const stacked_pd *B, const double *d,
The output C is stacked_pd. */
matrix *BTA_spd_spd_alloc(const stacked_pd *B, const stacked_pd *A);

/* Fill values of C = B^T @ A where both B and A are stacked_pd, and C is
stacked_pd. C must be pre-allocated via BTA_spd_spd_alloc. */
void BTA_spd_spd_fill_values(const stacked_pd *B, const stacked_pd *A,
stacked_pd *C);

/* Fill values of C = B^T @ diag(d) @ A where both B and A are stacked_pd,
d is a global vector of length B->m, and C is stacked_pd. C must be
pre-allocated via BTA_spd_spd_alloc. */
Expand All @@ -89,6 +104,11 @@ void BTDA_spd_spd_fill_values(const stacked_pd *B, const double *d,
The output C is stacked_pd. */
matrix *BTA_csc_spd_alloc(const CSC_matrix *B, const stacked_pd *A);

/* Fill values of C = B^T @ A where B is CSC and A is stacked_pd, and C is
stacked_pd. C must be pre-allocated via BTA_csc_spd_alloc. */
void BTA_csc_spd_fill_values(const CSC_matrix *B, const stacked_pd *A,
stacked_pd *C);

/* Fill values of C = B^T @ diag(d) @ A where B is CSC, A is stacked_pd,
d is a global vector of length A->m, and C is stacked_pd. C must be
pre-allocated via BTA_csc_spd_alloc. */
Expand Down
85 changes: 84 additions & 1 deletion src/utils/matmul_dispatchers.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,20 @@
#include <assert.h>

/* Forward declarations of the fixed-B dispatchers used internally by
BTA_matrices_alloc / BTDA_matrices_fill_values. */
BTA_matrices_alloc / BTA_matrices_fill_values / BTDA_matrices_fill_values. */
static matrix *BTA_pd_matrices_alloc(const permuted_dense *B, matrix *A);
static void BTA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A,
permuted_dense *C);
static void BTDA_pd_matrices_fill_values(const permuted_dense *B, const double *d,
const matrix *A, permuted_dense *C);
static matrix *BTA_spd_matrices_alloc(const stacked_pd *B, matrix *A);
static void BTA_spd_matrices_fill_values(const stacked_pd *B, const matrix *A,
stacked_pd *C);
static void BTDA_spd_matrices_fill_values(const stacked_pd *B, const double *d,
const matrix *A, stacked_pd *C);
static matrix *BTA_sparse_matrices_alloc(const sparse_matrix *B, matrix *A);
static void BTA_sparse_matrices_fill_values(const sparse_matrix *B, const matrix *A,
matrix *C);
static void BTDA_sparse_matrices_fill_values(const sparse_matrix *B, const double *d,
const matrix *A, matrix *C);

Expand Down Expand Up @@ -65,6 +71,23 @@ void BTDA_matrices_fill_values(matrix *A, const double *d, matrix *B, matrix *C)
BTDA_sparse_matrices_fill_values((const sparse_matrix *) B, d, A, C);
}

void BTA_matrices_fill_values(matrix *A, matrix *B, matrix *C)
{
if (B->is_permuted_dense)
{
BTA_pd_matrices_fill_values((const permuted_dense *) B, A,
(permuted_dense *) C);
return;
}
if (B->is_stacked_pd)
{
BTA_spd_matrices_fill_values((const stacked_pd *) B, A, (stacked_pd *) C);
return;
}
/* B is sparse_matrix */
BTA_sparse_matrices_fill_values((const sparse_matrix *) B, A, C);
}

matrix *BA_pd_matrices_alloc(const permuted_dense *B, matrix *A)
{
if (A->is_permuted_dense)
Expand Down Expand Up @@ -137,6 +160,25 @@ static void BTDA_pd_matrices_fill_values(const permuted_dense *B, const double *
BTDA_pd_csc_fill_values(B, d, sm_A->csc_cache, C);
}

static void BTA_pd_matrices_fill_values(const permuted_dense *B, const matrix *A,
permuted_dense *C)
{
if (A->is_permuted_dense)
{
BTA_pd_pd_fill_values(B, (const permuted_dense *) A, C);
return;
}
if (A->is_stacked_pd)
{
BTA_pd_spd_fill_values(B, (const stacked_pd *) A, C);
return;
}

/* A is sparse */
const sparse_matrix *sm_A = (const sparse_matrix *) A;
BTA_pd_csc_fill_values(B, sm_A->csc_cache, C);
}

static matrix *BTA_spd_matrices_alloc(const stacked_pd *B, matrix *A)
{
if (A->is_permuted_dense)
Expand Down Expand Up @@ -173,6 +215,25 @@ static void BTDA_spd_matrices_fill_values(const stacked_pd *B, const double *d,
BTDA_spd_csc_fill_values(B, d, sm_A->csc_cache, C);
}

static void BTA_spd_matrices_fill_values(const stacked_pd *B, const matrix *A,
stacked_pd *C)
{
if (A->is_permuted_dense)
{
BTA_spd_pd_fill_values(B, (const permuted_dense *) A, C);
return;
}
if (A->is_stacked_pd)
{
BTA_spd_spd_fill_values(B, (const stacked_pd *) A, C);
return;
}

/* A is sparse */
const sparse_matrix *sm_A = (const sparse_matrix *) A;
BTA_spd_csc_fill_values(B, sm_A->csc_cache, C);
}

static matrix *BTA_sparse_matrices_alloc(const sparse_matrix *B, matrix *A)
{
/* Ensure B's csc_cache structure exists. */
Expand Down Expand Up @@ -216,6 +277,28 @@ static void BTDA_sparse_matrices_fill_values(const sparse_matrix *B, const doubl
BTDA_fill_values(sm_A->csc_cache, B->csc_cache, d, ((sparse_matrix *) C)->csr);
}

static void BTA_sparse_matrices_fill_values(const sparse_matrix *B, const matrix *A,
matrix *C)
{
if (A->is_permuted_dense)
{
BTA_csc_pd_fill_values(B->csc_cache, (const permuted_dense *) A,
(permuted_dense *) C);
return;
}
if (A->is_stacked_pd)
{
BTA_csc_spd_fill_values(B->csc_cache, (const stacked_pd *) A,
(stacked_pd *) C);
return;
}

/* A is sparse */
const sparse_matrix *sm_A = (const sparse_matrix *) A;
BTDA_fill_values(sm_A->csc_cache, B->csc_cache, NULL,
((sparse_matrix *) C)->csr);
}

/* Debug-only check that A is the "full" permuted_dense shape the kron
helpers require: m0 == base.m, n0 == base.n, identity inner perms.
Called once at alloc; not repeated at fill (caller would have to
Expand Down
55 changes: 43 additions & 12 deletions src/utils/permuted_dense_linalg.c
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,21 @@ matrix *BTA_pd_csc_alloc(const permuted_dense *B, const CSC_matrix *A)
return C;
}

/* C = B^T @ A = (B^T) A, with B^T materialized into B->kernel_dwork. The no-
diagonal analogue of BTDA_pd_csc_fill_values. */
void BTA_pd_csc_fill_values(const permuted_dense *B, const CSC_matrix *A,
permuted_dense *C)
{
/* C may be empty */
if (C->base.nnz == 0)
{
return;
}

A_transpose(B->kernel_dwork, B->X, B->m0, B->n0);
BA_pd_csc_fill_values(B->kernel_dwork, B->m0, B->row_inv, A, C);
}

/* C = B^T diag(d) A = (diag (d) B)^T A */
void BTDA_pd_csc_fill_values(const permuted_dense *B, const double *d,
const CSC_matrix *A, permuted_dense *C)
Expand Down Expand Up @@ -469,12 +484,12 @@ matrix *BTA_csc_pd_alloc(const CSC_matrix *B, const permuted_dense *A)
return C;
}

/* Internal helper for BTDA_csc_pd_fill_values: C = B^T @ A where B is CSC
and the right operand A is supplied as a transposed-layout raw buffer
(row j of A_T = m0_A contiguous doubles = the j-th column of A's dense
/* Internal core for BTA_csc_pd_fill_values / BTDA_csc_pd_fill_values: C = B^T @ A
where B is CSC and the right operand A is supplied as a transposed-layout raw
buffer (row j of A_T = m0_A contiguous doubles = the j-th column of A's dense
block). Transposed-output sibling of BA_pd_csc_fill_values. */
static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m0_A,
const int *inv, permuted_dense *C)
static void BTA_csc_denseT_fill_values(const CSC_matrix *B, const double *A_T,
int m0_A, const int *inv, permuted_dense *C)
{
/* C[i_C, j_C] = dot(col C->row_perm[i_C] of B, row j_C of A_T). */
for (int i_C = 0; i_C < C->m0; i_C++)
Expand All @@ -491,9 +506,25 @@ static void BTA_csc_pd_fill_values(const CSC_matrix *B, const double *A_T, int m
}
}

/* C = B^T @ A where B is CSC and A is permuted_dense. The csc_pd analogue of
BTDA_csc_pd_fill_values without the diagonal: transpose A's dense block into
A->kernel_dwork (the column-contiguous layout the core wants) and delegate. */
void BTA_csc_pd_fill_values(const CSC_matrix *B, const permuted_dense *A,
permuted_dense *C)
{
if (C->base.nnz == 0)
{
return;
}

/* A->kernel_dwork = X_A^T, row-major shape (n0_A, m0_A). */
A_transpose(A->kernel_dwork, A->X, A->m0, A->n0);
BTA_csc_denseT_fill_values(B, A->kernel_dwork, A->m0, A->row_inv, C);
}

/* C = B^T diag(d) A. Folds diag(d) into A's dense block (writing
(diag(d_perm) X_A)^T into A->kernel_dwork) and delegates to
BTA_csc_pd_fill_values. Mirrors how BTDA_pd_csc_fill_values wraps
BTA_csc_denseT_fill_values. Mirrors how BTDA_pd_csc_fill_values wraps
BA_pd_csc_fill_values. */
void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
const permuted_dense *A, permuted_dense *C)
Expand All @@ -509,7 +540,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
/* A->kernel_dwork = (diag(d_perm) X_A)^T, row-major shape (n0_A, m0_A).
Pre-sized by BTA_csc_pd_alloc; no allocation in fill.
Column j of (diag(d) X_A) lives contiguously in dwork as row j —
which is exactly the layout BTA_csc_pd_fill_values wants. */
which is exactly the layout BTA_csc_denseT_fill_values wants. */
for (int kk = 0; kk < m0_A; kk++)
{
double dk = d[A->row_perm[kk]];
Expand All @@ -519,7 +550,7 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
}
}

BTA_csc_pd_fill_values(B, A->kernel_dwork, m0_A, A->row_inv, C);
BTA_csc_denseT_fill_values(B, A->kernel_dwork, m0_A, A->row_inv, C);
}

/* Original transpose-via-Cprime implementation of BTDA_csc_pd_fill_values.
Expand All @@ -528,10 +559,10 @@ void BTDA_csc_pd_fill_values(const CSC_matrix *B, const double *d,
#if defined(__GNUC__) || defined(__clang__)
__attribute__((unused))
#endif
static void BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B,
const double *d,
const permuted_dense *A,
permuted_dense *C)
static void
BTDA_csc_pd_fill_values_via_transpose_dead(const CSC_matrix *B, const double *d,
const permuted_dense *A,
permuted_dense *C)
{
if (C->base.nnz == 0)
{
Expand Down
Loading
Loading