|
1 | | -import type { ChartTimeGranularity, EvolutionData } from '~/types/chart' |
2 | | -import { DOWNLOAD_ANOMALIES } from './download-anomalies.data' |
| 1 | +import type { EvolutionData } from '~/types/chart' |
3 | 2 |
|
4 | | -export type DownloadAnomalyBound = { |
5 | | - date: string // YYYY-MM-DD |
6 | | - weeklyDownloads: number |
7 | | -} |
| 3 | +/** |
| 4 | + * Hampel filter for automatic anomaly detection and correction. |
| 5 | + * |
| 6 | + * For each data point, computes the median and Median Absolute Deviation (MAD) |
| 7 | + * of a surrounding window. Points deviating more than `threshold` MADs from |
| 8 | + * the local median are flagged as anomalies and replaced with the median. |
| 9 | + * |
| 10 | + * This approach is unbiased — it applies the same statistical test to every |
| 11 | + * package equally, with no manual curation. |
| 12 | + */ |
8 | 13 |
|
9 | | -export type DownloadAnomaly = { |
10 | | - packageName: string |
11 | | - start: DownloadAnomalyBound |
12 | | - end: DownloadAnomalyBound |
13 | | -} |
| 14 | +const DEFAULT_HALF_WINDOW = 3 |
| 15 | +const DEFAULT_THRESHOLD = 3 |
14 | 16 |
|
15 | | -function getDateString(point: Record<string, any>, granularity: ChartTimeGranularity): string { |
16 | | - switch (granularity) { |
17 | | - case 'daily': |
18 | | - return point.day |
19 | | - case 'weekly': |
20 | | - return point.weekStart |
21 | | - case 'monthly': |
22 | | - return `${point.month}-01` |
23 | | - case 'yearly': |
24 | | - return `${point.year}-01-01` |
25 | | - } |
| 17 | +function median(values: number[]): number { |
| 18 | + const sorted = [...values].sort((a, b) => a - b) |
| 19 | + const mid = Math.floor(sorted.length / 2) |
| 20 | + return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2 |
26 | 21 | } |
27 | 22 |
|
28 | | -/** |
29 | | - * For daily the point date falls strictly between the anomaly bounds. |
30 | | - * For weekly the point date is the week start, and the full 7-day range is |
31 | | - * checked so any overlapping week is affected. |
32 | | - * For monthly/yearly the anomaly bounds are truncated to the same resolution |
33 | | - * so that any period overlapping the anomaly is caught (inclusive). |
34 | | - */ |
35 | | -function isDateAffected( |
36 | | - date: string, |
37 | | - anomaly: DownloadAnomaly, |
38 | | - granularity: ChartTimeGranularity, |
39 | | -): boolean { |
40 | | - switch (granularity) { |
41 | | - case 'daily': |
42 | | - return date > anomaly.start.date && date < anomaly.end.date |
43 | | - case 'weekly': { |
44 | | - const startWeek = date |
45 | | - const weekStartDate = new Date(`${date}T00:00:00Z`) |
46 | | - const weekEndDate = new Date(weekStartDate) |
47 | | - weekEndDate.setUTCDate(weekEndDate.getUTCDate() + 6) |
48 | | - const endWeek = weekEndDate.toISOString().slice(0, 10) |
49 | | - return startWeek < anomaly.end.date && endWeek > anomaly.start.date |
50 | | - } |
51 | | - case 'monthly': { |
52 | | - const monthStart = date |
53 | | - const monthStartDate = new Date(`${date}T00:00:00Z`) |
54 | | - const monthEndDate = new Date(monthStartDate) |
55 | | - monthEndDate.setUTCMonth(monthEndDate.getUTCMonth() + 1) |
56 | | - monthEndDate.setUTCDate(monthEndDate.getUTCDate() - 1) |
57 | | - const monthEnd = monthEndDate.toISOString().slice(0, 10) |
58 | | - return monthStart < anomaly.end.date && monthEnd > anomaly.start.date |
59 | | - } |
60 | | - case 'yearly': { |
61 | | - const yearStart = date |
62 | | - const yearEnd = `${date.slice(0, 4)}-12-31` |
63 | | - return yearStart < anomaly.end.date && yearEnd > anomaly.start.date |
64 | | - } |
65 | | - } |
| 23 | +function mad(values: number[], med: number): number { |
| 24 | + const deviations = values.map(v => Math.abs(v - med)) |
| 25 | + return median(deviations) |
66 | 26 | } |
67 | 27 |
|
68 | | -function scaleWeeklyValue(weeklyValue: number, granularity: ChartTimeGranularity): number { |
69 | | - switch (granularity) { |
70 | | - case 'daily': |
71 | | - return Math.round(weeklyValue / 7) |
72 | | - case 'weekly': |
73 | | - return weeklyValue |
74 | | - case 'monthly': |
75 | | - return Math.round((weeklyValue / 7) * 30) |
76 | | - case 'yearly': |
77 | | - return Math.round((weeklyValue / 7) * 365) |
78 | | - } |
79 | | -} |
| 28 | +export function applyHampelCorrection( |
| 29 | + data: EvolutionData, |
| 30 | + opts?: { halfWindow?: number; threshold?: number }, |
| 31 | +): EvolutionData { |
| 32 | + // halfWindow controls how many neighbors on each side to consider. |
| 33 | + // A window of 3 means we look at 7 points total (3 left + current + 3 right). |
| 34 | + const halfWindow = opts?.halfWindow ?? DEFAULT_HALF_WINDOW |
80 | 35 |
|
81 | | -export function getAnomaliesForPackages( |
82 | | - packageNames: string[], |
83 | | -): { packageName: string; start: string; end: string }[] { |
84 | | - return DOWNLOAD_ANOMALIES.filter(a => packageNames.includes(a.packageName)).map(a => ({ |
85 | | - packageName: a.packageName, |
86 | | - start: a.start.date, |
87 | | - end: a.end.date, |
88 | | - })) |
89 | | -} |
| 36 | + // threshold controls sensitivity. A value of 3 means a point must deviate |
| 37 | + // more than 3 scaled MADs from the local median to be flagged. |
| 38 | + // Higher = less sensitive, lower = more aggressive filtering. |
| 39 | + const threshold = opts?.threshold ?? DEFAULT_THRESHOLD |
90 | 40 |
|
91 | | -export function applyBlocklistCorrection(opts: { |
92 | | - data: EvolutionData |
93 | | - packageName: string |
94 | | - granularity: ChartTimeGranularity |
95 | | -}): EvolutionData { |
96 | | - const { data, packageName, granularity } = opts |
97 | | - const anomalies = DOWNLOAD_ANOMALIES.filter(a => a.packageName === packageName) |
98 | | - if (!anomalies.length) return data |
| 41 | + // Not enough data to form a full window — return as-is. |
| 42 | + if (data.length < halfWindow * 2 + 1) return data |
99 | 43 |
|
100 | | - // Clone to avoid mutation |
| 44 | + const values = (data as Array<{ value: number }>).map(d => d.value) |
| 45 | + // Clone to avoid mutating the original data. |
101 | 46 | const result = (data as Array<Record<string, any>>).map(d => ({ ...d })) |
102 | 47 |
|
103 | | - for (const anomaly of anomalies) { |
104 | | - // Find indices of affected points |
105 | | - const affectedIndices: number[] = [] |
106 | | - for (let i = 0; i < result.length; i++) { |
107 | | - const date = getDateString(result[i]!, granularity) |
108 | | - if (isDateAffected(date, anomaly, granularity)) { |
109 | | - affectedIndices.push(i) |
110 | | - } |
111 | | - } |
| 48 | + for (let i = 0; i < values.length; i++) { |
| 49 | + // Build a sliding window around the current point, clamped to array bounds. |
| 50 | + const start = Math.max(0, i - halfWindow) |
| 51 | + const end = Math.min(values.length - 1, i + halfWindow) |
| 52 | + const window = values.slice(start, end + 1) |
112 | 53 |
|
113 | | - if (!affectedIndices.length) continue |
| 54 | + // The median is robust to outliers — unlike the mean, a single spike |
| 55 | + // won't pull it away from the true central tendency. |
| 56 | + const windowMedian = median(window) |
114 | 57 |
|
115 | | - const firstAffected = affectedIndices[0]! |
116 | | - const lastAffected = affectedIndices[affectedIndices.length - 1]! |
| 58 | + // MAD (Median Absolute Deviation) measures spread without being |
| 59 | + // influenced by the outliers we're trying to detect. |
| 60 | + const windowMad = mad(window, windowMedian) |
117 | 61 |
|
118 | | - // Use neighbors when available, fall back to scaled weeklyDownloads |
119 | | - const scaledStart = scaleWeeklyValue(anomaly.start.weeklyDownloads, granularity) |
120 | | - const scaledEnd = scaleWeeklyValue(anomaly.end.weeklyDownloads, granularity) |
| 62 | + // How far this point is from the local median. |
| 63 | + const deviation = Math.abs(values[i]! - windowMedian) |
121 | 64 |
|
122 | | - const startVal = firstAffected > 0 ? result[firstAffected - 1]!.value : scaledStart |
123 | | - const endVal = lastAffected < result.length - 1 ? result[lastAffected + 1]!.value : scaledEnd |
| 65 | + // MAD of 0 means most values in the window are identical. |
| 66 | + // If this point differs from the median at all, it's an outlier. |
| 67 | + if (windowMad === 0) { |
| 68 | + if (deviation > 0) { |
| 69 | + result[i]!.value = Math.round(windowMedian) |
| 70 | + result[i]!.hasAnomaly = true |
| 71 | + } |
| 72 | + continue |
| 73 | + } |
124 | 74 |
|
125 | | - const count = affectedIndices.length |
126 | | - for (let i = 0; i < count; i++) { |
127 | | - const t = (i + 1) / (count + 1) |
128 | | - result[affectedIndices[i]!]!.value = Math.round(startVal + t * (endVal - startVal)) |
129 | | - result[affectedIndices[i]!]!.hasAnomaly = true |
| 75 | + // Scale MAD to approximate standard deviation using the consistency |
| 76 | + // constant 1.4826 (valid for normally distributed data). |
| 77 | + // The resulting score is essentially "how many standard deviations |
| 78 | + // away from the local median is this point?" |
| 79 | + const score = deviation / (windowMad * 1.4826) |
| 80 | + |
| 81 | + // If the score exceeds the threshold, replace with the median. |
| 82 | + // This corrects the spike while preserving the surrounding trend. |
| 83 | + if (score > threshold) { |
| 84 | + result[i]!.value = Math.round(windowMedian) |
| 85 | + result[i]!.hasAnomaly = true |
130 | 86 | } |
131 | 87 | } |
| 88 | + |
132 | 89 | return result as EvolutionData |
133 | 90 | } |
0 commit comments