Skip to content

Commit a29c7d3

Browse files
committed
Replace manual anomalies with a Hampel filter
1 parent d3cfce5 commit a29c7d3

5 files changed

Lines changed: 135 additions & 365 deletions

File tree

app/components/Package/TrendsChart.vue

Lines changed: 8 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import {
2323
endDateOnlyToUtcMs,
2424
DEFAULT_PREDICTION_POINTS,
2525
} from '~/utils/chart-data-prediction'
26-
import { applyBlocklistCorrection, getAnomaliesForPackages } from '~/utils/download-anomalies'
26+
import { applyHampelCorrection } from '~/utils/download-anomalies'
2727
import { copyAltTextForTrendLineChart, sanitise, loadFile, applyEllipsis } from '~/utils/charts'
2828
2929
import('vue-data-ui/style.css')
@@ -978,11 +978,7 @@ const effectiveDataSingle = computed<EvolutionData>(() => {
978978
if (isDownloadsMetric.value && data.length) {
979979
const pkg = effectivePackageNames.value[0] ?? props.packageName ?? ''
980980
if (settings.value.chartFilter.anomaliesFixed) {
981-
data = applyBlocklistCorrection({
982-
data,
983-
packageName: pkg,
984-
granularity: displayedGranularity.value,
985-
})
981+
data = applyHampelCorrection(data)
986982
}
987983
}
988984
@@ -1026,7 +1022,7 @@ const chartData = computed<{
10261022
let data = state.evolutionsByPackage[pkg] ?? []
10271023
if (isDownloadsMetric.value && data.length) {
10281024
if (settings.value.chartFilter.anomaliesFixed) {
1029-
data = applyBlocklistCorrection({ data, packageName: pkg, granularity })
1025+
data = applyHampelCorrection(data)
10301026
}
10311027
}
10321028
const points = extractSeriesPoints(granularity, data)
@@ -1556,20 +1552,6 @@ const chartConfig = computed<VueUiXyConfig>(() => {
15561552
const isDownloadsMetric = computed(() => selectedMetric.value === 'downloads')
15571553
const showCorrectionControls = shallowRef(false)
15581554
1559-
const packageAnomalies = computed(() => getAnomaliesForPackages(effectivePackageNames.value))
1560-
const hasAnomalies = computed(() => packageAnomalies.value.length > 0)
1561-
1562-
function formatAnomalyDate(dateStr: string) {
1563-
const [y, m, d] = dateStr.split('-').map(Number)
1564-
if (!y || !m || !d) return dateStr
1565-
return new Intl.DateTimeFormat(locale.value, {
1566-
year: 'numeric',
1567-
month: 'short',
1568-
day: 'numeric',
1569-
timeZone: 'UTC',
1570-
}).format(new Date(Date.UTC(y, m - 1, d)))
1571-
}
1572-
15731555
// Trigger data loading when the metric is switched
15741556
watch(selectedMetric, value => {
15751557
if (!isMounted.value) return
@@ -1722,64 +1704,28 @@ watch(selectedMetric, value => {
17221704
class="text-2xs font-mono text-fg-subtle tracking-wide uppercase flex items-center justify-between"
17231705
>
17241706
{{ $t('package.trends.known_anomalies') }}
1725-
<TooltipApp interactive :to="inModal ? '#chart-modal' : undefined">
1707+
<TooltipApp :to="inModal ? '#chart-modal' : undefined">
17261708
<button
17271709
type="button"
17281710
class="i-lucide:info w-3.5 h-3.5 text-fg-muted cursor-help"
17291711
:aria-label="$t('package.trends.known_anomalies')"
17301712
/>
17311713
<template #content>
1732-
<div class="flex flex-col gap-3">
1733-
<p class="text-xs text-fg-muted">
1734-
{{ $t('package.trends.known_anomalies_description') }}
1735-
</p>
1736-
<div v-if="hasAnomalies">
1737-
<p class="text-xs text-fg-subtle font-medium">
1738-
{{ $t('package.trends.known_anomalies_ranges') }}
1739-
</p>
1740-
<ul class="text-xs text-fg-subtle list-disc list-inside">
1741-
<li v-for="a in packageAnomalies" :key="`${a.packageName}-${a.start}`">
1742-
{{
1743-
isMultiPackageMode
1744-
? $t('package.trends.known_anomalies_range_named', {
1745-
packageName: a.packageName,
1746-
start: formatAnomalyDate(a.start),
1747-
end: formatAnomalyDate(a.end),
1748-
})
1749-
: $t('package.trends.known_anomalies_range', {
1750-
start: formatAnomalyDate(a.start),
1751-
end: formatAnomalyDate(a.end),
1752-
})
1753-
}}
1754-
</li>
1755-
</ul>
1756-
</div>
1757-
<p v-else class="text-xs text-fg-muted">
1758-
{{ $t('package.trends.known_anomalies_none', effectivePackageNames.length) }}
1759-
</p>
1760-
<div class="flex justify-end">
1761-
<LinkBase
1762-
to="https://github.com/npmx-dev/npmx.dev/edit/main/app/utils/download-anomalies.data.ts"
1763-
class="text-xs text-accent"
1764-
>
1765-
{{ $t('package.trends.known_anomalies_contribute') }}
1766-
</LinkBase>
1767-
</div>
1768-
</div>
1714+
<p class="text-xs text-fg-muted">
1715+
{{ $t('package.trends.known_anomalies_description') }}
1716+
</p>
17691717
</template>
17701718
</TooltipApp>
17711719
</span>
17721720
<label
17731721
class="flex items-center gap-1.5 text-2xs font-mono text-fg-subtle cursor-pointer h-4"
1774-
:class="{ 'opacity-50 pointer-events-none': !hasAnomalies }"
17751722
>
17761723
<input
1777-
:checked="settings.chartFilter.anomaliesFixed && hasAnomalies"
1724+
:checked="settings.chartFilter.anomaliesFixed"
17781725
@change="
17791726
settings.chartFilter.anomaliesFixed = ($event.target as HTMLInputElement).checked
17801727
"
17811728
type="checkbox"
1782-
:disabled="!hasAnomalies"
17831729
class="accent-[var(--accent-color,var(--fg-subtle))]"
17841730
/>
17851731
{{ $t('package.trends.apply_correction') }}

app/components/Package/WeeklyDownloadStats.vue

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { useCssVariables } from '~/composables/useColors'
44
import type { WeeklyDataPoint } from '~/types/chart'
55
import { applyDataCorrection } from '~/utils/chart-data-correction'
66
import { OKLCH_NEUTRAL_FALLBACK, lightenOklch } from '~/utils/colors'
7-
import { applyBlocklistCorrection } from '~/utils/download-anomalies'
7+
import { applyHampelCorrection } from '~/utils/download-anomalies'
88
import type { RepoRef } from '#shared/utils/git-providers'
99
import type { VueUiSparklineConfig, VueUiSparklineDatasetItem } from 'vue-data-ui'
1010
import { onKeyDown } from '@vueuse/core'
@@ -186,11 +186,7 @@ const correctedDownloads = computed<WeeklyDataPoint[]>(() => {
186186
let data = weeklyDownloads.value as WeeklyDataPoint[]
187187
if (!data.length) return data
188188
if (settings.value.chartFilter.anomaliesFixed) {
189-
data = applyBlocklistCorrection({
190-
data,
191-
packageName: props.packageName,
192-
granularity: 'weekly',
193-
}) as WeeklyDataPoint[]
189+
data = applyHampelCorrection(data) as WeeklyDataPoint[]
194190
}
195191
data = applyDataCorrection(data, settings.value.chartFilter) as WeeklyDataPoint[]
196192
return data

app/utils/download-anomalies.data.ts

Lines changed: 0 additions & 30 deletions
This file was deleted.

app/utils/download-anomalies.ts

Lines changed: 69 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,133 +1,90 @@
1-
import type { ChartTimeGranularity, EvolutionData } from '~/types/chart'
2-
import { DOWNLOAD_ANOMALIES } from './download-anomalies.data'
1+
import type { EvolutionData } from '~/types/chart'
32

4-
export type DownloadAnomalyBound = {
5-
date: string // YYYY-MM-DD
6-
weeklyDownloads: number
7-
}
3+
/**
4+
* Hampel filter for automatic anomaly detection and correction.
5+
*
6+
* For each data point, computes the median and Median Absolute Deviation (MAD)
7+
* of a surrounding window. Points deviating more than `threshold` MADs from
8+
* the local median are flagged as anomalies and replaced with the median.
9+
*
10+
* This approach is unbiased — it applies the same statistical test to every
11+
* package equally, with no manual curation.
12+
*/
813

9-
export type DownloadAnomaly = {
10-
packageName: string
11-
start: DownloadAnomalyBound
12-
end: DownloadAnomalyBound
13-
}
14+
// Fallback for `opts.halfWindow`: how many neighbours on each side of a point
// form its window (3 left + the point + 3 right = 7 points away from the edges).
const DEFAULT_HALF_WINDOW = 3
15+
// Fallback for `opts.threshold`: a point is replaced when its distance from the
// window median, divided by the scaled MAD (MAD * 1.4826), exceeds this value.
const DEFAULT_THRESHOLD = 3
1416

15-
function getDateString(point: Record<string, any>, granularity: ChartTimeGranularity): string {
16-
switch (granularity) {
17-
case 'daily':
18-
return point.day
19-
case 'weekly':
20-
return point.weekStart
21-
case 'monthly':
22-
return `${point.month}-01`
23-
case 'yearly':
24-
return `${point.year}-01-01`
25-
}
17+
/**
 * Returns the median of `values` without mutating the input array.
 *
 * For an even number of elements the result is the mean of the two middle
 * values. An empty array has no median, so `NaN` is returned explicitly
 * rather than falling out of arithmetic on out-of-range reads.
 *
 * @param values - Numbers to summarise; the array is copied before sorting.
 * @returns The median, or `NaN` when `values` is empty.
 */
function median(values: number[]): number {
  // Guard first so the indexed reads below are always in range.
  if (values.length === 0) return Number.NaN

  // Numeric comparator: the default sort compares elements as strings.
  const sorted = [...values].sort((a, b) => a - b)
  const mid = Math.floor(sorted.length / 2)
  return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2
}
2722

28-
/**
29-
* For daily the point date falls strictly between the anomaly bounds.
30-
* For weekly the point date is the week start, and the full 7-day range is
31-
* checked so any overlapping week is affected.
32-
* For monthly/yearly the anomaly bounds are truncated to the same resolution
33-
* so that any period overlapping the anomaly is caught (inclusive).
34-
*/
35-
function isDateAffected(
36-
date: string,
37-
anomaly: DownloadAnomaly,
38-
granularity: ChartTimeGranularity,
39-
): boolean {
40-
switch (granularity) {
41-
case 'daily':
42-
return date > anomaly.start.date && date < anomaly.end.date
43-
case 'weekly': {
44-
const startWeek = date
45-
const weekStartDate = new Date(`${date}T00:00:00Z`)
46-
const weekEndDate = new Date(weekStartDate)
47-
weekEndDate.setUTCDate(weekEndDate.getUTCDate() + 6)
48-
const endWeek = weekEndDate.toISOString().slice(0, 10)
49-
return startWeek < anomaly.end.date && endWeek > anomaly.start.date
50-
}
51-
case 'monthly': {
52-
const monthStart = date
53-
const monthStartDate = new Date(`${date}T00:00:00Z`)
54-
const monthEndDate = new Date(monthStartDate)
55-
monthEndDate.setUTCMonth(monthEndDate.getUTCMonth() + 1)
56-
monthEndDate.setUTCDate(monthEndDate.getUTCDate() - 1)
57-
const monthEnd = monthEndDate.toISOString().slice(0, 10)
58-
return monthStart < anomaly.end.date && monthEnd > anomaly.start.date
59-
}
60-
case 'yearly': {
61-
const yearStart = date
62-
const yearEnd = `${date.slice(0, 4)}-12-31`
63-
return yearStart < anomaly.end.date && yearEnd > anomaly.start.date
64-
}
65-
}
23+
/**
 * Median Absolute Deviation: the median of each value's absolute distance
 * from `med`.
 *
 * @param values - Samples to measure.
 * @param med - Centre to measure distances from (the caller passes the median of `values`).
 * @returns The median of `|value - med|` over all samples.
 */
function mad(values: number[], med: number): number {
  return median(values.map(value => Math.abs(value - med)))
}
6727

68-
function scaleWeeklyValue(weeklyValue: number, granularity: ChartTimeGranularity): number {
69-
switch (granularity) {
70-
case 'daily':
71-
return Math.round(weeklyValue / 7)
72-
case 'weekly':
73-
return weeklyValue
74-
case 'monthly':
75-
return Math.round((weeklyValue / 7) * 30)
76-
case 'yearly':
77-
return Math.round((weeklyValue / 7) * 365)
78-
}
79-
}
28+
/**
 * Runs a Hampel filter over a series and replaces detected outliers with the
 * rounded median of their surrounding window.
 *
 * Detection always reads the original values, so a correction made at one
 * index never influences the verdict for its neighbours.
 *
 * @param data - Series whose points carry a numeric `value`.
 * @param opts - Optional tuning.
 *   `halfWindow`: neighbours considered on each side of a point. A halfWindow
 *   of 3 gives a 7-point window (3 left + current + 3 right); near the ends of
 *   the series the window is clamped to the array bounds and is shorter.
 *   `threshold`: how many scaled MADs a point may sit from the window median
 *   before it is replaced. Higher is less sensitive.
 * @returns `data` itself when it holds fewer than `2 * halfWindow + 1` points.
 *   Otherwise a new array of shallow-copied points in which every replaced
 *   point has its `value` overwritten and `hasAnomaly` set to `true`.
 */
export function applyHampelCorrection(
  data: EvolutionData,
  opts?: { halfWindow?: number; threshold?: number },
): EvolutionData {
  const radius = opts?.halfWindow ?? DEFAULT_HALF_WINDOW
  const cutoff = opts?.threshold ?? DEFAULT_THRESHOLD

  // Too short to hold even one full window: hand the input back untouched.
  if (data.length < radius * 2 + 1) return data

  // Shallow copies, so the caller's points are never mutated.
  const corrected = (data as Array<Record<string, any>>).map(point => ({ ...point }))
  // Snapshot of the raw values that every window is built from.
  const series = (data as Array<{ value: number }>).map(point => point.value)

  series.forEach((current, i) => {
    const from = Math.max(0, i - radius)
    const to = Math.min(series.length - 1, i + radius)
    const neighbourhood = series.slice(from, to + 1)

    // Median and MAD are used because a single spike barely moves either.
    const centre = median(neighbourhood)
    const spread = mad(neighbourhood, centre)
    const distance = Math.abs(current - centre)

    // A spread of 0 means most of the window equals the median, so any
    // departure counts as an outlier. Otherwise scale the MAD by 1.4826 (the
    // constant that makes it comparable to a standard deviation for normally
    // distributed data) and compare the resulting score to the cutoff.
    const isOutlier = spread === 0 ? distance > 0 : distance / (spread * 1.4826) > cutoff
    if (!isOutlier) return

    corrected[i]!.value = Math.round(centre)
    corrected[i]!.hasAnomaly = true
  })

  return corrected as EvolutionData
}

0 commit comments

Comments
 (0)