Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@
},
"dependencies": {
"@huggingface/transformers": "^3.8.1",
"hdbscan-ts": "^1.0.17",
"js-tiktoken": "^1.0.21",
"umap-js": "^1.4.0",
"vectra": "^0.12.3"
}
}
47 changes: 46 additions & 1 deletion src/commands/testEmbed.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { fetchAllNotes } from '../pipeline/noteReader';
import { benchmark } from '../pipeline/clustering/benchmark';
import { CategorizationConfig } from '../types/cluster';
import { averageVectors, blendVectors, computeTitleWeight, cosineSimilarity } from '../pipeline/vectorAggregator';
import { NoteVector, WorkerMessage } from '../types/embed';
import { isGenericTitle } from '../utils/titleFilter';
Expand Down Expand Up @@ -144,7 +146,50 @@ export const runTestEmbed = async (installDir: string) => {
await cache.endUpdate();

worker.terminate();
log('Worker terminated. Test complete.');
log('Worker terminated. Embedding complete.');

// ── Clustering Benchmark ─────────────────────────────
// Edit this config to compare different algorithms and dimensions.
// Results are printed as a comparison table in the console.
const clusterConfig: CategorizationConfig = {
seed: 42,
metric: 'cosine',
intermediateDim: 10,
intermediateNeighbors: 15,
strategies: [
{ name: 'kmeans-5', algorithm: 'kmeans', K: 5 },
{ name: 'kmedoids-5', algorithm: 'kmedoids', K: 5 },
{ name: 'hdbscan-3', algorithm: 'hdbscan', minClusterSize: 3 },
{ name: 'hdbscan-3-ms2', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 },
{ name: 'hdbscan-5-ms2', algorithm: 'hdbscan', minClusterSize: 5, minSamples: 2 },
],
};

if (noteVectors.length >= 3) {
const vectors = noteVectors.map((nv) => nv.vector);
const results = benchmark(vectors, clusterConfig);

// Log note titles per cluster for all strategies, in order (best to worst)
for (const res of results) {
log(`\nCluster assignments (${res.strategyName}):`);
const clusterNotes = new Map<number, string[]>();
for (let i = 0; i < noteVectors.length; i++) {
const c = res.assignments[i];
if (!clusterNotes.has(c)) clusterNotes.set(c, []);
clusterNotes.get(c)!.push(noteVectors[i].title);
}
Comment thread
HahaBill marked this conversation as resolved.
for (const [clusterId, titles] of clusterNotes) {
const label = clusterId < 0 ? 'Noise/Outliers' : `Cluster ${clusterId}`;
log(` ${label} (${titles.length} notes):`);
for (const title of titles) {
log(` - ${title}`);
}
}
Comment thread
HahaBill marked this conversation as resolved.
}
} else {
log('Too few notes for clustering (need at least 3).');
}

return;
}

Expand Down
66 changes: 66 additions & 0 deletions src/pipeline/UmapProjector.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import { UMAP } from 'umap-js';
import { log } from '../utils/logger';
import { mulberry32 } from '../utils/prng';
import { cosineDistance, euclideanDistance } from './clustering/metrics';
import { UmapProjectorOptions } from '../types/projector';

/**
 * Projects high-dimensional vectors into a lower-dimensional space with UMAP.
 *
 * Options that are not supplied fall back to: 2 output components,
 * 15 neighbors, minDist 0.1, the cosine metric and seed 42. The seed feeds a
 * `mulberry32` generator that is handed to UMAP as its random source.
 */
export class UmapProjector {
  /** Lowest neighbor count that is ever handed to UMAP. */
  private static readonly MIN_NEIGHBORS = 2;

  private readonly nComponents: number;
  private readonly nNeighbors: number;
  private readonly minDist: number;
  private readonly metric: 'cosine' | 'euclidean';
  private readonly seed: number;

  /**
   * @param options Projection settings; every field is optional.
   */
  constructor(options: UmapProjectorOptions = {}) {
    this.nComponents = options.nComponents ?? 2;
    this.nNeighbors = options.nNeighbors ?? 15;
    this.minDist = options.minDist ?? 0.1;
    this.metric = options.metric ?? 'cosine';
    this.seed = options.seed ?? 42;
  }

  /**
   * Projects high-dimensional vectors to a lower-dimensional space using UMAP.
   *
   * When there are too few vectors for UMAP to run, each vector is instead
   * truncated or zero-padded to `nComponents` entries and returned as-is.
   *
   * @param vectors N vectors of dimension D (N x D)
   * @returns N vectors of dimension nComponents (empty for empty input)
   * @throws Error if the vectors do not all share the dimension of the first one
   */
  public project(vectors: number[][]): number[][] {
    if (vectors.length === 0) return [];

    const dim = vectors[0].length;
    for (let i = 0; i < vectors.length; i++) {
      if (vectors[i].length !== dim) {
        throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${dim}`);
      }
    }

    // UMAP needs more points than output dimensions to be meaningful, and the
    // neighbor count (never below MIN_NEIGHBORS) must stay strictly below the
    // number of points. Checking only nComponents let e.g. two vectors with
    // nComponents = 1 reach UMAP with nNeighbors equal to the point count.
    const fewestUsable = Math.max(this.nComponents, UmapProjector.MIN_NEIGHBORS);
    if (vectors.length <= fewestUsable) {
      log(`Too few vectors (${vectors.length}) for ${this.nComponents}D projection, padding with zeros.`);
      return vectors.map((vec) => {
        // Copy so the caller's arrays are never mutated by the padding below.
        const out = vec.slice(0, this.nComponents);
        while (out.length < this.nComponents) out.push(0);
        return out;
      });
    }

    // nNeighbors must be less than the number of data points; the guard above
    // guarantees vectors.length - 1 >= MIN_NEIGHBORS, so the clamp holds.
    const nNeighbors = Math.max(
      UmapProjector.MIN_NEIGHBORS,
      Math.min(this.nNeighbors, vectors.length - 1),
    );
    const distanceFn = this.metric === 'euclidean' ? euclideanDistance : cosineDistance;

    const umap = new UMAP({
      nComponents: this.nComponents,
      nNeighbors,
      minDist: this.minDist,
      distanceFn,
      random: mulberry32(this.seed),
    });

    log(
      `UMAP: projecting ${vectors.length} vectors (${dim}D → ${this.nComponents}D), ` +
        `neighbors=${nNeighbors}, seed=${this.seed}`,
    );

    return umap.fit(vectors);
  }
}
Loading
Loading