diff --git a/package-lock.json b/package-lock.json index 664cf80..538618a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,7 +10,9 @@ "license": "MIT", "dependencies": { "@huggingface/transformers": "^3.8.1", + "hdbscan-ts": "^1.0.17", "js-tiktoken": "^1.0.21", + "umap-js": "^1.4.0", "vectra": "^0.12.3" }, "devDependencies": { @@ -3072,6 +3074,14 @@ "node": ">= 0.4" } }, + "node_modules/hdbscan-ts": { + "version": "1.0.17", + "resolved": "https://registry.npmjs.org/hdbscan-ts/-/hdbscan-ts-1.0.17.tgz", + "integrity": "sha512-I44z8twblt0FxbiCNVidpQ10lAXKyFwZxRz1CKd9/53qJoDJlJamXoOJF+N1ObhY4xGr1eBUXrfYyxiX9YuIJg==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/htmlparser2": { "version": "10.1.0", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.1.0.tgz", @@ -3225,6 +3235,11 @@ "node": ">= 0.10" } }, + "node_modules/is-any-array": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-0.1.1.tgz", + "integrity": "sha512-qTiELO+kpTKqPgxPYbshMERlzaFu29JDnpB8s3bjg+JkxBpw29/qqSaOdKv2pCdaG92rLGeG/zG2GauX58hfoA==" + }, "node_modules/is-core-module": { "version": "2.16.2", "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.2.tgz", @@ -3632,6 +3647,70 @@ "node": ">=10" } }, + "node_modules/ml-array-max": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ml-array-max/-/ml-array-max-2.0.0.tgz", + "integrity": "sha512-QQZ4kENwpWmyNb98UXRDFXrmtIXuXtt1+bSbda/2KA85+F+rrJP8hZk6QOkCQXM2Th9mUDYdq/PNByPdT9ID4A==", + "dependencies": { + "is-any-array": "^3.0.0" + } + }, + "node_modules/ml-array-max/node_modules/is-any-array": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-3.0.0.tgz", + "integrity": "sha512-o4h+tylWykC4BD1vaejp6gDxoM13bwW8FGuNs4yIKpj8xbBJcRxJx8vZpq0dCr7ZDEfeKjmsi/euolKhX6f/ww==" + }, + "node_modules/ml-array-min": { + "version": "2.0.0", + "resolved": 
"https://registry.npmjs.org/ml-array-min/-/ml-array-min-2.0.0.tgz", + "integrity": "sha512-GRj6Ky6sW9vGL6yIjgsHmXZ9YgrdmcQ8nCxPqEGeKc6dkfYg1XDYxGFxADUjNuZyoCd5PUscWAS4N+cFaX6hFg==", + "dependencies": { + "is-any-array": "^3.0.0" + } + }, + "node_modules/ml-array-min/node_modules/is-any-array": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-3.0.0.tgz", + "integrity": "sha512-o4h+tylWykC4BD1vaejp6gDxoM13bwW8FGuNs4yIKpj8xbBJcRxJx8vZpq0dCr7ZDEfeKjmsi/euolKhX6f/ww==" + }, + "node_modules/ml-array-rescale": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ml-array-rescale/-/ml-array-rescale-2.0.0.tgz", + "integrity": "sha512-2GGtKfSno94/kIloWGvpp/U5Q5vLvLrza+SAaGsLeo6Xj4mEbA6Gqx+oTfZFkxnd1grT2X007HfJNs3T5BsiVg==", + "dependencies": { + "is-any-array": "^3.0.0", + "ml-array-max": "^2.0.0", + "ml-array-min": "^2.0.0" + } + }, + "node_modules/ml-array-rescale/node_modules/is-any-array": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-any-array/-/is-any-array-3.0.0.tgz", + "integrity": "sha512-o4h+tylWykC4BD1vaejp6gDxoM13bwW8FGuNs4yIKpj8xbBJcRxJx8vZpq0dCr7ZDEfeKjmsi/euolKhX6f/ww==" + }, + "node_modules/ml-levenberg-marquardt": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ml-levenberg-marquardt/-/ml-levenberg-marquardt-2.1.1.tgz", + "integrity": "sha512-2+HwUqew4qFFFYujYlQtmFUrxCB4iJAPqnUYro3P831wj70eJZcANwcRaIMGUVaH9NDKzfYuA4N5u67KExmaRA==", + "dependencies": { + "is-any-array": "^0.1.0", + "ml-matrix": "^6.4.1" + } + }, + "node_modules/ml-matrix": { + "version": "6.12.2", + "resolved": "https://registry.npmjs.org/ml-matrix/-/ml-matrix-6.12.2.tgz", + "integrity": "sha512-GC+BnW+pBh8Auap8goAxY0senAmF0IEoc3HNVSfnfbvGw0buuDIYb9kAKMS1l+GiwJ1rfK2bzJ8IHhwjzATSFA==", + "dependencies": { + "is-any-array": "^3.0.0", + "ml-array-rescale": "^2.0.0" + } + }, + "node_modules/ml-matrix/node_modules/is-any-array": { + "version": "3.0.0", + "resolved": 
"https://registry.npmjs.org/is-any-array/-/is-any-array-3.0.0.tgz", + "integrity": "sha512-o4h+tylWykC4BD1vaejp6gDxoM13bwW8FGuNs4yIKpj8xbBJcRxJx8vZpq0dCr7ZDEfeKjmsi/euolKhX6f/ww==" + }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", @@ -4871,6 +4950,14 @@ "node": ">=4.2.0" } }, + "node_modules/umap-js": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/umap-js/-/umap-js-1.4.0.tgz", + "integrity": "sha512-xxpviF9wUO6Nxrx+C58SoDgea+h2PnVaRPKDelWv0HotmY6BeWeh0kAPJoumfqUkzUvowGsYfMbnsWI0b9do+A==", + "dependencies": { + "ml-levenberg-marquardt": "^2.0.0" + } + }, "node_modules/undici": { "version": "7.26.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.26.0.tgz", diff --git a/package.json b/package.json index 0164100..76fc5a0 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,9 @@ }, "dependencies": { "@huggingface/transformers": "^3.8.1", + "hdbscan-ts": "^1.0.17", "js-tiktoken": "^1.0.21", + "umap-js": "^1.4.0", "vectra": "^0.12.3" } } diff --git a/src/commands/testEmbed.ts b/src/commands/testEmbed.ts index 0853a5a..744d9a9 100644 --- a/src/commands/testEmbed.ts +++ b/src/commands/testEmbed.ts @@ -1,4 +1,6 @@ import { fetchAllNotes } from '../pipeline/noteReader'; +import { benchmark } from '../pipeline/clustering/benchmark'; +import { CategorizationConfig } from '../types/cluster'; import { averageVectors, blendVectors, computeTitleWeight, cosineSimilarity } from '../pipeline/vectorAggregator'; import { NoteVector, WorkerMessage } from '../types/embed'; import { isGenericTitle } from '../utils/titleFilter'; @@ -144,7 +146,50 @@ export const runTestEmbed = async (installDir: string) => { await cache.endUpdate(); worker.terminate(); - log('Worker terminated. Test complete.'); + log('Worker terminated. Embedding complete.'); + + // ── Clustering Benchmark ───────────────────────────── + // Edit this config to compare different algorithms and dimensions. 
+ // Results are printed as a comparison table in the console. + const clusterConfig: CategorizationConfig = { + seed: 42, + metric: 'cosine', + intermediateDim: 10, + intermediateNeighbors: 15, + strategies: [ + { name: 'kmeans-5', algorithm: 'kmeans', K: 5 }, + { name: 'kmedoids-5', algorithm: 'kmedoids', K: 5 }, + { name: 'hdbscan-3', algorithm: 'hdbscan', minClusterSize: 3 }, + { name: 'hdbscan-3-ms2', algorithm: 'hdbscan', minClusterSize: 3, minSamples: 2 }, + { name: 'hdbscan-5-ms2', algorithm: 'hdbscan', minClusterSize: 5, minSamples: 2 }, + ], + }; + + if (noteVectors.length >= 3) { + const vectors = noteVectors.map((nv) => nv.vector); + const results = benchmark(vectors, clusterConfig); + + // Log note titles per cluster for all strategies, in order (best to worst) + for (const res of results) { + log(`\nCluster assignments (${res.strategyName}):`); + const clusterNotes = new Map(); + for (let i = 0; i < noteVectors.length; i++) { + const c = res.assignments[i]; + if (!clusterNotes.has(c)) clusterNotes.set(c, []); + clusterNotes.get(c)!.push(noteVectors[i].title); + } + for (const [clusterId, titles] of clusterNotes) { + const label = clusterId < 0 ? 
/**
 * Wrapper around umap-js that projects vectors down to `nComponents`
 * dimensions, feeding UMAP a seeded PRNG so repeated runs draw the same
 * random stream.
 */
export class UmapProjector {
  private readonly nComponents: number;
  private readonly nNeighbors: number;
  private readonly minDist: number;
  private readonly metric: 'cosine' | 'euclidean';
  private readonly seed: number;

  /**
   * @param options Optional overrides. Unset fields fall back to
   *   nComponents = 2, nNeighbors = 15, minDist = 0.1, metric = 'cosine', seed = 42.
   */
  constructor(options: UmapProjectorOptions = {}) {
    const { nComponents, nNeighbors, minDist, metric, seed } = options;
    this.nComponents = nComponents ?? 2;
    this.nNeighbors = nNeighbors ?? 15;
    this.minDist = minDist ?? 0.1;
    this.metric = metric ?? 'cosine';
    this.seed = seed ?? 42;
  }

  /**
   * Projects N vectors of equal dimension D to N vectors of dimension nComponents.
   *
   * @param vectors Input matrix (N x D)
   * @returns One output vector per input vector; `[]` for empty input
   * @throws Error when the input vectors do not all share the same dimension
   */
  public project(vectors: number[][]): number[][] {
    const count = vectors.length;
    if (count === 0) return [];

    // Every row must match the width of the first one.
    const dim = vectors[0].length;
    for (const [index, vec] of vectors.entries()) {
      if (vec.length !== dim) {
        throw new Error(`Vector at index ${index} has dimension ${vec.length}, expected ${dim}`);
      }
    }

    // With no more points than output dimensions UMAP is skipped entirely:
    // each vector is cut to its first nComponents entries and zero-filled if shorter.
    if (count <= this.nComponents) {
      log(`Too few vectors (${count}) for ${this.nComponents}D projection, padding with zeros.`);
      return vectors.map((vec) => {
        const resized = vec.slice(0, this.nComponents);
        while (resized.length < this.nComponents) resized.push(0);
        return resized;
      });
    }

    // Clamp the neighbour count to [2, N - 1].
    // NOTE(review): when N = 2 (only reachable with nComponents = 1) the lower
    // bound of 2 wins and equals N; confirm umap-js tolerates that.
    const neighbors = Math.max(2, Math.min(this.nNeighbors, count - 1));

    const umap = new UMAP({
      nComponents: this.nComponents,
      nNeighbors: neighbors,
      minDist: this.minDist,
      distanceFn: this.metric === 'euclidean' ? euclideanDistance : cosineDistance,
      random: mulberry32(this.seed),
    });

    log(
      `UMAP: projecting ${count} vectors (${dim}D → ${this.nComponents}D), ` +
        `neighbors=${neighbors}, seed=${this.seed}`,
    );

    return umap.fit(vectors);
  }
}
/**
 * Dispatches one strategy to the matching clustering implementation and
 * returns its per-vector cluster assignments.
 *
 * Missing `K` falls back to DEFAULT_K; missing `minClusterSize` falls back to
 * DEFAULT_MIN_CLUSTER_SIZE. The seed is forwarded to kmeans and kmedoids only.
 *
 * @throws Error when `strategy.algorithm` is not one of the known algorithms
 */
function runStrategy(vectors: number[][], strategy: ClusteringStrategy, distFn: DistanceFn, seed: number): number[] {
  const { algorithm } = strategy;

  if (algorithm === 'kmeans') {
    return kmeans(vectors, strategy.K ?? DEFAULT_K, distFn, seed);
  }
  if (algorithm === 'kmedoids') {
    return kmedoids(vectors, strategy.K ?? DEFAULT_K, distFn, seed);
  }
  if (algorithm === 'hdbscan') {
    return hdbscan(vectors, strategy.minClusterSize ?? DEFAULT_MIN_CLUSTER_SIZE, strategy.minSamples, distFn);
  }

  throw new Error(`Unknown clustering algorithm: ${strategy.algorithm}`);
}

/**
 * Tallies how many points carry each non-negative cluster label.
 *
 * @param assignments Cluster label per point; negative labels (noise) are ignored
 * @returns Sizes indexed by cluster ID from 0 up to the largest label seen
 *   (labels that never occur get 0); `[]` when no non-negative label exists
 */
function computeClusterSizes(assignments: number[]): number[] {
  const tally = new Map<number, number>();
  assignments.forEach((label) => {
    if (label < 0) return; // noise
    tally.set(label, (tally.get(label) ?? 0) + 1);
  });

  if (tally.size === 0) return [];

  const highestLabel = Math.max(...tally.keys());
  return Array.from({ length: highestLabel + 1 }, (_, id) => tally.get(id) ?? 0);
}
/**
 * Writes the benchmark results to the log as a fixed-width table, one row per
 * result in the order given, followed by a "Best" line naming the first entry
 * (callers are expected to pass results already sorted best-first).
 */
function logBenchmarkTable(results: BenchmarkResult[]): void {
  const SEPARATOR = ' | ';

  log('=== Clustering Benchmark Results ===');
  log('');

  const header = [
    'Strategy'.padEnd(20),
    'Algo'.padEnd(10),
    'Cls'.padStart(3),
    'Silhouette'.padStart(10),
    'Outliers'.padStart(8),
    'Time'.padStart(8),
    'Cluster Sizes',
  ].join(SEPARATOR);
  log(header);
  // The rule is 20 characters wider than the header to leave room for the size list.
  log('-'.repeat(header.length + 20));

  results.forEach((row) => {
    const cells = [
      row.strategyName.padEnd(20),
      row.algorithm.padEnd(10),
      String(row.clusterCount).padStart(3),
      row.silhouetteScore.toFixed(4).padStart(10),
      String(row.outlierCount).padStart(8),
      (row.timeMs.toFixed(0) + 'ms').padStart(8),
      `[${row.clusterSizes.join(', ')}]`,
    ];
    log(cells.join(SEPARATOR));
  });

  log('');
  const [best] = results;
  if (results.length > 0) {
    log(`Best: ${best.strategyName} (silhouette = ${best.silhouetteScore.toFixed(4)})`);
  }
  log('===================================');
}
/**
 * Runs every strategy in `config.strategies` over the same vectors, scores
 * each clustering with the silhouette coefficient, logs a comparison table
 * and returns the results ordered best score first.
 *
 * When `config.intermediateDim` is not null the vectors are first reduced with
 * UmapProjector and all strategies cluster the reduced vectors. Points labeled
 * below 0 (noise) are left out of the silhouette computation, and a clustering
 * with fewer than two non-empty clusters scores 0. A strategy that throws is
 * logged and omitted from the results.
 *
 * NOTE(review): the configured metric is also applied to the UMAP output;
 * confirm that is intended when metric = 'cosine'.
 *
 * @param vectors High-dimensional note vectors (N x D)
 * @param config  Seed, metric, optional UMAP reduction and strategies to run
 * @returns One result per successful strategy, sorted by silhouette score descending
 */
export function benchmark(vectors: number[][], config: CategorizationConfig): BenchmarkResult[] {
  if (vectors.length === 0) {
    log('No vectors to cluster.');
    return [];
  }

  const distFn = getDistanceFn(config.metric);

  // Optional dimensionality reduction shared by all strategies.
  let points = vectors;
  const targetDim = config.intermediateDim;
  if (targetDim !== null) {
    log(`Reducing ${vectors[0].length}D → ${targetDim}D for clustering...`);
    points = new UmapProjector({
      nComponents: targetDim,
      nNeighbors: config.intermediateNeighbors,
      metric: config.metric,
      seed: config.seed,
    }).project(vectors);
  }

  const results: BenchmarkResult[] = [];

  config.strategies.forEach((strategy) => {
    log(`Running strategy: ${strategy.name} (${strategy.algorithm})...`);
    const startedAt = performance.now();

    try {
      const assignments = runStrategy(points, strategy, distFn, config.seed);
      const timeMs = performance.now() - startedAt;

      const outlierCount = assignments.reduce((total, label) => (label < 0 ? total + 1 : total), 0);
      const clusterSizes = computeClusterSizes(assignments);
      const clusterCount = clusterSizes.filter((size) => size > 0).length;

      // Score only the points that were actually assigned to a cluster.
      let score = 0;
      if (clusterCount >= 2) {
        const keptVectors: number[][] = [];
        const keptLabels: number[] = [];
        assignments.forEach((label, index) => {
          if (label >= 0) {
            keptVectors.push(points[index]);
            keptLabels.push(label);
          }
        });
        score = silhouetteScore(keptVectors, keptLabels, distFn);
      }

      results.push({
        strategyName: strategy.name,
        algorithm: strategy.algorithm,
        clusterCount,
        assignments,
        clusterSizes,
        silhouetteScore: score,
        outlierCount,
        timeMs,
      });
    } catch (err) {
      log(`Strategy ${strategy.name} failed: ${err}`);
    }
  });

  // Best (highest silhouette) first.
  results.sort((left, right) => right.silhouetteScore - left.silhouetteScore);

  logBenchmarkTable(results);

  return results;
}
/**
 * Clusters the vectors with the hdbscan-ts library.
 *
 * hdbscan-ts is given no distance function here, so for any `distFn` other
 * than `euclideanDistance` the vectors are L2-normalized first; on unit
 * vectors Euclidean distance grows monotonically with cosine distance
 * (euclidean² = 2 · (1 − cosine similarity)).
 *
 * @param vectors        Input data points (N x D)
 * @param minClusterSize Minimum number of points that may form a cluster (default: 3)
 * @param minSamples     Passed to the library as `minSamples`; defaults to `minClusterSize`
 * @param distFn         Only inspected to decide whether to normalize
 * @returns One label per input vector as produced by the library; when
 *   N < minClusterSize every point is labeled -1 (noise) without calling it
 * @throws Error on empty input or when minClusterSize < 2
 */
export function hdbscan(
  vectors: number[][],
  minClusterSize: number = DEFAULT_MIN_CLUSTER_SIZE,
  minSamples: number | undefined,
  distFn: DistanceFn,
): number[] {
  const count = vectors.length;
  if (count === 0) throw new Error('Cannot cluster empty input');
  if (minClusterSize < 2) throw new Error('minClusterSize must be at least 2');

  // Too few points to ever form a cluster: everything is noise.
  if (count < minClusterSize) return Array.from({ length: count }, () => -1);

  // Anything that is not the Euclidean metric is treated as cosine.
  const prepared = distFn === euclideanDistance ? vectors : vectors.map(normalize);

  const clusterer = new HDBSCAN({
    minClusterSize,
    minSamples: minSamples ?? minClusterSize,
  });

  return clusterer.fit(prepared);
}

/**
 * Returns a unit-length copy of the vector. A zero vector cannot be scaled
 * and is returned as the same array instance.
 */
function normalize(vec: number[]): number[] {
  let sumOfSquares = 0;
  for (const component of vec) {
    sumOfSquares += component * component;
  }
  const length = Math.sqrt(sumOfSquares);
  return length === 0 ? vec : vec.map((component) => component / length);
}
/**
 * Picks K initial centroids with the k-means++ strategy.
 *
 * The first centroid is a uniformly random input point. Each further centroid
 * is drawn with probability proportional to the squared distance between a
 * point and its nearest already-chosen centroid.
 *
 * Points with zero weight (they coincide with an existing centroid) are never
 * drawn while some point has positive weight, so a draw of exactly 0 from
 * `rng` no longer duplicates a centroid. If floating-point rounding leaves the
 * running threshold non-negative after the scan, the last positive-weight
 * point is used. When every point has zero weight, index 0 is used.
 *
 * @param vectors Input data points (N x D), N >= 1
 * @param K       Number of centroids to produce
 * @param distFn  Distance function
 * @param rng     Source of uniform numbers in [0, 1); called once per centroid
 * @returns K centroids, each a copy of an input vector
 */
function initCentroids(vectors: number[][], K: number, distFn: DistanceFn, rng: () => number): number[][] {
  const n = vectors.length;
  const centroids: number[][] = [];

  // First centroid: uniformly random point.
  centroids.push([...vectors[Math.floor(rng() * n)]]);

  for (let c = 1; c < K; c++) {
    // Weight of each point = squared distance to its nearest centroid.
    const weights = new Float64Array(n);
    let totalWeight = 0;

    for (let i = 0; i < n; i++) {
      let minDist = Infinity;
      for (const centroid of centroids) {
        const d = distFn(vectors[i], centroid);
        if (d < minDist) minDist = d;
      }
      weights[i] = minDist * minDist;
      totalWeight += weights[i];
    }

    // Weighted draw: walk the cumulative weights until they exceed the target.
    let threshold = rng() * totalWeight;
    let selected = -1;
    let lastPositive = 0;
    for (let i = 0; i < n; i++) {
      if (weights[i] <= 0) continue; // already a centroid location
      lastPositive = i;
      threshold -= weights[i];
      if (threshold < 0) {
        selected = i;
        break;
      }
    }
    // Rounding shortfall (or all-zero weights): fall back deterministically.
    if (selected < 0) selected = lastPositive;

    centroids.push([...vectors[selected]]);
  }

  return centroids;
}

/**
 * Labels each vector with the index of its nearest centroid.
 * Ties go to the lowest centroid index.
 *
 * @returns One centroid index per input vector
 */
function assignClusters(vectors: number[][], centroids: number[][], distFn: DistanceFn): number[] {
  return vectors.map((vec) => {
    let bestCluster = 0;
    let bestDist = Infinity;
    for (let c = 0; c < centroids.length; c++) {
      const d = distFn(vec, centroids[c]);
      if (d < bestDist) {
        bestDist = d;
        bestCluster = c;
      }
    }
    return bestCluster;
  });
}
/**
 * Builds the next set of centroids as the per-dimension mean of the points
 * currently assigned to each cluster.
 *
 * A cluster with no members gets a copy of a random input point instead, so
 * it can attract points again on the next assignment pass. `rng` is consulted
 * only for empty clusters, in ascending cluster order.
 *
 * @param vectors     Input data points (N x D)
 * @param assignments Cluster index (0..K-1) per point
 * @param K           Number of clusters
 * @param dim         Dimension D of the vectors
 * @param rng         Source of uniform numbers in [0, 1)
 * @returns K centroids of dimension `dim`
 */
function recomputeCentroids(
  vectors: number[][],
  assignments: number[],
  K: number,
  dim: number,
  rng: () => number,
): number[][] {
  const sums: number[][] = Array.from({ length: K }, () => new Array<number>(dim).fill(0));
  const sizes = new Array<number>(K).fill(0);

  // Accumulate component sums and member counts per cluster.
  for (const [index, vec] of vectors.entries()) {
    const label = assignments[index];
    sizes[label]++;
    for (let axis = 0; axis < dim; axis++) {
      sums[label][axis] += vec[axis];
    }
  }

  return sums.map((sum, label) => {
    const size = sizes[label];
    if (size === 0) {
      // Empty cluster: re-seed from a random point.
      return [...vectors[Math.floor(rng() * vectors.length)]];
    }
    return sum.map((component) => component / size);
  });
}
/**
 * K-Means clustering: k-means++ seeding followed by Lloyd iterations
 * (recompute centroids, reassign points) until no label changes or the
 * iteration cap is hit.
 *
 * Centroids are arithmetic means, which is the exact minimizer only for
 * squared Euclidean distance; with another `distFn` (e.g. cosine) the
 * procedure is a heuristic.
 *
 * @param vectors Input data points (N x D)
 * @param K       Number of clusters
 * @param distFn  Distance function
 * @param seed    Seed for the PRNG that drives initialization and re-seeding
 * @param maxIter Maximum number of Lloyd iterations (default: 100)
 * @returns Cluster index per vector. When K >= N each point becomes its own
 *   cluster (labels 0..N-1); otherwise labels are in 0..K-1.
 * @throws Error on empty input or non-positive K
 */
export function kmeans(
  vectors: number[][],
  K: number,
  distFn: DistanceFn,
  seed: number,
  maxIter: number = MAX_ITERATIONS,
): number[] {
  const n = vectors.length;
  if (n === 0) throw new Error('Cannot cluster empty input');
  if (K <= 0) throw new Error('K must be positive');

  // Nothing to optimize: one cluster per point.
  if (K >= n) return vectors.map((_, index) => index);

  const dim = vectors[0].length;
  const rng = mulberry32(seed);

  let centroids = initCentroids(vectors, K, distFn, rng);
  let labels = assignClusters(vectors, centroids, distFn);

  for (let pass = 0; pass < maxIter; pass++) {
    centroids = recomputeCentroids(vectors, labels, K, dim, rng);
    const nextLabels = assignClusters(vectors, centroids, distFn);

    // Converged once a full pass leaves every label untouched.
    const previous = labels;
    const stable = nextLabels.every((label, index) => label === previous[index]);

    labels = nextLabels;
    if (stable) break;
  }

  return labels;
}
/**
 * Returns the index of the non-medoid point whose distance to its nearest
 * medoid is largest (first such point on ties). Used to seed medoids
 * greedily. Returns 0 when every point is already a medoid.
 */
function findFarthestPoint(vectors: number[][], medoidIndices: number[], distFn: DistanceFn): number {
  const taken = new Set(medoidIndices);
  let farthest = 0;
  let farthestGap = -1;

  for (const [index, candidate] of vectors.entries()) {
    if (taken.has(index)) continue;

    // Distance from this candidate to the closest existing medoid.
    const gap = medoidIndices.reduce((nearest, medoid) => {
      const d = distFn(candidate, vectors[medoid]);
      return d < nearest ? d : nearest;
    }, Infinity);

    if (gap > farthestGap) {
      farthestGap = gap;
      farthest = index;
    }
  }

  return farthest;
}

/**
 * Labels each point with the position (within `medoidIndices`) of its nearest
 * medoid. Ties go to the earliest medoid.
 */
function assignToMedoids(vectors: number[][], medoidIndices: number[], distFn: DistanceFn): number[] {
  const labels: number[] = [];

  for (const vec of vectors) {
    let nearestSlot = 0;
    let nearestDist = Infinity;
    medoidIndices.forEach((medoid, slot) => {
      const d = distFn(vec, vectors[medoid]);
      if (d < nearestDist) {
        nearestDist = d;
        nearestSlot = slot;
      }
    });
    labels.push(nearestSlot);
  }

  return labels;
}

/**
 * Sums, over all points, the distance from the point to the medoid it is
 * assigned to.
 */
function totalCost(vectors: number[][], assignments: number[], medoidIndices: number[], distFn: DistanceFn): number {
  return vectors.reduce(
    (sum, vec, index) => sum + distFn(vec, vectors[medoidIndices[assignments[index]]]),
    0,
  );
}
/**
 * K-Medoids clustering with a simplified PAM (Partitioning Around Medoids).
 *
 * Cluster centers are always actual input points, so any distance function
 * can be used. Seeding picks one random medoid and then repeatedly adds the
 * point farthest from the current medoids. The swap phase greedily accepts
 * any medoid/non-medoid exchange that lowers the total point-to-medoid
 * distance, and stops after a full pass without improvement or `maxIter` passes.
 *
 * @param vectors Input data points (N x D)
 * @param K       Number of clusters
 * @param distFn  Distance function
 * @param seed    Seed for the PRNG that picks the first medoid
 * @param maxIter Maximum number of swap passes (default: 100)
 * @returns Cluster index per vector. When K >= N each point becomes its own
 *   cluster (labels 0..N-1); otherwise labels are in 0..K-1.
 * @throws Error on empty input or non-positive K
 */
export function kmedoids(
  vectors: number[][],
  K: number,
  distFn: DistanceFn,
  seed: number,
  maxIter: number = MAX_ITERATIONS,
): number[] {
  const n = vectors.length;
  if (n === 0) throw new Error('Cannot cluster empty input');
  if (K <= 0) throw new Error('K must be positive');
  if (K >= n) return vectors.map((_, index) => index);

  const rng = mulberry32(seed);

  // Seeding: random first medoid, then farthest-point additions.
  const medoids: number[] = [Math.floor(rng() * n)];
  while (medoids.length < K) {
    medoids.push(findFarthestPoint(vectors, medoids, distFn));
  }

  let labels = assignToMedoids(vectors, medoids, distFn);
  let bestCost = totalCost(vectors, labels, medoids, distFn);

  // Swap phase: try every (medoid slot, non-medoid point) pair.
  for (let pass = 0; pass < maxIter; pass++) {
    let improved = false;

    for (let slot = 0; slot < K; slot++) {
      for (let candidate = 0; candidate < n; candidate++) {
        if (medoids.includes(candidate)) continue;

        const previous = medoids[slot];
        medoids[slot] = candidate;

        const trialLabels = assignToMedoids(vectors, medoids, distFn);
        const trialCost = totalCost(vectors, trialLabels, medoids, distFn);

        if (!(trialCost < bestCost)) {
          // No gain: undo the exchange.
          medoids[slot] = previous;
          continue;
        }

        labels = trialLabels;
        bestCost = trialCost;
        improved = true;
      }
    }

    if (!improved) break;
  }

  return labels;
}
/** Distance between two equal-length vectors. */
export type DistanceFn = (a: number[], b: number[]) => number;

/**
 * Cosine distance: 1 − cos(angle between a and b).
 * Returns 1 when either vector has zero length, since the angle is undefined.
 * Iterates over the indices of `a`.
 */
export function cosineDistance(a: number[], b: number[]): number {
  let dot = 0;
  let sqA = 0;
  let sqB = 0;
  for (const [i, x] of a.entries()) {
    const y = b[i];
    dot += x * y;
    sqA += x * x;
    sqB += y * y;
  }

  const denom = Math.sqrt(sqA) * Math.sqrt(sqB);
  return denom === 0 ? 1 : 1 - dot / denom;
}

/**
 * Euclidean (straight-line) distance. Iterates over the indices of `a`.
 */
export function euclideanDistance(a: number[], b: number[]): number {
  let sumOfSquares = 0;
  for (const [i, x] of a.entries()) {
    const delta = x - b[i];
    sumOfSquares += delta * delta;
  }
  return Math.sqrt(sumOfSquares);
}

/**
 * Maps a metric name to its distance function. Any value other than
 * 'euclidean' selects cosine distance.
 */
export function getDistanceFn(metric: 'cosine' | 'euclidean'): DistanceFn {
  if (metric === 'euclidean') {
    return euclideanDistance;
  }
  return cosineDistance;
}
/**
 * Computes the mean silhouette coefficient of a clustering.
 *
 * For each point i in a cluster with at least two members:
 *   a(i) = mean distance to the other points of its own cluster
 *   b(i) = smallest mean distance to the points of any other cluster
 *   s(i) = (b(i) - a(i)) / max(a(i), b(i))   (0 when both are 0)
 *
 * A point that is alone in its cluster contributes s(i) = 0, following the
 * standard definition. Treating a(i) as 0 there would give every singleton a
 * perfect score of 1 and reward clusterings that isolate each point.
 *
 * @param vectors     Data points (N x D)
 * @param assignments Cluster label per point, same order as `vectors`
 * @param distFn      Distance function
 * @returns Mean of s(i) over all N points, in [-1, +1]; 0 when there are
 *   fewer than two points or fewer than two distinct clusters
 */
export function silhouetteScore(vectors: number[][], assignments: number[], distFn: DistanceFn): number {
  const n = vectors.length;
  if (n <= 1) return 0;

  // Group point indices by cluster label.
  const membersByCluster = new Map<number, number[]>();
  for (let i = 0; i < n; i++) {
    const label = assignments[i];
    const members = membersByCluster.get(label);
    if (members === undefined) {
      membersByCluster.set(label, [i]);
    } else {
      members.push(i);
    }
  }
  if (membersByCluster.size <= 1) return 0;

  let totalScore = 0;

  for (let i = 0; i < n; i++) {
    const ownLabel = assignments[i];
    const ownMembers = membersByCluster.get(ownLabel);

    // Singleton cluster: s(i) is defined as 0, so it adds nothing to the sum.
    if (ownMembers === undefined || ownMembers.length <= 1) continue;

    // a(i): mean distance to the other members of the same cluster.
    let a = 0;
    for (const j of ownMembers) {
      if (j !== i) a += distFn(vectors[i], vectors[j]);
    }
    a /= ownMembers.length - 1;

    // b(i): mean distance to the closest foreign cluster.
    let b = Infinity;
    for (const [label, members] of membersByCluster) {
      if (label === ownLabel) continue;
      let meanDist = 0;
      for (const j of members) {
        meanDist += distFn(vectors[i], vectors[j]);
      }
      meanDist /= members.length;
      if (meanDist < b) b = meanDist;
    }

    const scale = Math.max(a, b);
    totalScore += scale === 0 ? 0 : (b - a) / scale;
  }

  return totalScore / n;
}
/**
 * Settings for one clustering benchmark run: a shared seed and metric, an
 * optional UMAP reduction step, and the list of strategies to compare.
 */
export interface CategorizationConfig {
  /** Seed for UMAP and clustering reproducibility */
  seed: number;
  /** Distance metric for clustering and UMAP */
  metric: 'cosine' | 'euclidean';
  /**
   * If set, UMAP-reduce to this dimensionality before clustering.
   * null = cluster directly on the raw embedding vectors (e.g. 384D).
   */
  intermediateDim: number | null;
  /** Number of nearest neighbors for UMAP intermediate projection */
  intermediateNeighbors: number;
  /** Clustering strategies to benchmark side-by-side */
  strategies: ClusteringStrategy[];
}

/**
 * Outcome of running a single clustering strategy in the benchmark.
 */
export interface BenchmarkResult {
  /** The `name` of the strategy that produced this result */
  strategyName: string;
  algorithm: ClusteringAlgorithm;
  /** Number of clusters that contain at least one note */
  clusterCount: number;
  /** Cluster ID per note, in the same order as the input vectors; negative = noise */
  assignments: number[];
  /** Number of notes in each cluster, indexed by cluster ID (noise excluded) */
  clusterSizes: number[];
  /** Mean silhouette coefficient: -1 (poor) to +1 (excellent); 0 when fewer than 2 clusters were found */
  silhouetteScore: number;
  /** Number of points classified as noise/outliers (HDBSCAN only) */
  outlierCount: number;
  /** Wall-clock duration of the clustering call in milliseconds */
  timeMs: number;
}

/**
 * Optional settings accepted by UmapProjector; every field has a default.
 */
export interface UmapProjectorOptions {
  /** Number of dimensions in the output (default: 2) */
  nComponents?: number;
  /** Number of nearest neighbors for manifold approximation (default: 15) */
  nNeighbors?: number;
  /** Minimum distance between points in output space (default: 0.1) */
  minDist?: number;
  /** Distance metric: 'cosine' or 'euclidean' (default: 'cosine') */
  metric?: 'cosine' | 'euclidean';
  /** Seed for reproducible results (default: 42) */
  seed?: number;
}
/**
 * Mulberry32: a small seedable 32-bit pseudo-random number generator.
 *
 * The returned function yields the same sequence of values in [0, 1) for the
 * same seed, which is why it is used instead of Math.random() wherever UMAP
 * and clustering results need to be reproducible.
 *
 * @param seed Initial state; coerced to a signed 32-bit integer
 * @returns A generator function; each call advances the internal state
 */
export function mulberry32(seed: number): () => number {
  let state = seed;

  return () => {
    // Advance the 32-bit state by the fixed increment.
    state = ((state | 0) + 0x6d2b79f5) | 0;

    // Two rounds of xor-shift / multiply mixing.
    const mixed = Math.imul(state ^ (state >>> 15), 1 | state);
    const scrambled = (mixed + Math.imul(mixed ^ (mixed >>> 7), 61 | mixed)) ^ mixed;

    // Scale the unsigned 32-bit result into [0, 1).
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}