Skip to content

Commit 1b0a0c7

Browse files
committed
Replace manual anomalies with a Hampel filter
1 parent d26e250 commit 1b0a0c7

5 files changed

Lines changed: 155 additions & 262 deletions

File tree

app/components/Package/TrendsChart.vue

Lines changed: 8 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import type {
1919
} from '~/types/chart'
2020
import { DATE_INPUT_MAX } from '~/utils/input'
2121
import { applyDataCorrection } from '~/utils/chart-data-correction'
22-
import { applyBlocklistCorrection, getAnomaliesForPackages } from '~/utils/download-anomalies'
22+
import { applyHampelCorrection } from '~/utils/download-anomalies'
2323
import { copyAltTextForTrendLineChart, sanitise, loadFile } from '~/utils/charts'
2424
2525
import('vue-data-ui/style.css')
@@ -966,11 +966,7 @@ const effectiveDataSingle = computed<EvolutionData>(() => {
966966
if (isDownloadsMetric.value && data.length) {
967967
const pkg = effectivePackageNames.value[0] ?? props.packageName ?? ''
968968
if (settings.value.chartFilter.anomaliesFixed) {
969-
data = applyBlocklistCorrection({
970-
data,
971-
packageName: pkg,
972-
granularity: displayedGranularity.value,
973-
})
969+
data = applyHampelCorrection(data)
974970
}
975971
976972
return applyDataCorrection(
@@ -1019,7 +1015,7 @@ const chartData = computed<{
10191015
let data = state.evolutionsByPackage[pkg] ?? []
10201016
if (isDownloadsMetric.value && data.length) {
10211017
if (settings.value.chartFilter.anomaliesFixed) {
1022-
data = applyBlocklistCorrection({ data, packageName: pkg, granularity })
1018+
data = applyHampelCorrection(data)
10231019
}
10241020
data = applyDataCorrection(
10251021
data as Array<{ value: number }>,
@@ -1681,20 +1677,6 @@ const chartConfig = computed<VueUiXyConfig>(() => {
16811677
const isDownloadsMetric = computed(() => selectedMetric.value === 'downloads')
16821678
const showCorrectionControls = shallowRef(false)
16831679
1684-
const packageAnomalies = computed(() => getAnomaliesForPackages(effectivePackageNames.value))
1685-
const hasAnomalies = computed(() => packageAnomalies.value.length > 0)
1686-
1687-
function formatAnomalyDate(dateStr: string) {
1688-
const [y, m, d] = dateStr.split('-').map(Number)
1689-
if (!y || !m || !d) return dateStr
1690-
return new Intl.DateTimeFormat(locale.value, {
1691-
year: 'numeric',
1692-
month: 'short',
1693-
day: 'numeric',
1694-
timeZone: 'UTC',
1695-
}).format(new Date(Date.UTC(y, m - 1, d)))
1696-
}
1697-
16981680
// Trigger data loading when the metric is switched
16991681
watch(selectedMetric, value => {
17001682
if (!isMounted.value) return
@@ -1831,64 +1813,28 @@ watch(selectedMetric, value => {
18311813
class="text-2xs font-mono text-fg-subtle tracking-wide uppercase flex items-center justify-between"
18321814
>
18331815
{{ $t('package.trends.known_anomalies') }}
1834-
<TooltipApp interactive :to="inModal ? '#chart-modal' : undefined">
1816+
<TooltipApp :to="inModal ? '#chart-modal' : undefined">
18351817
<button
18361818
type="button"
18371819
class="i-lucide:info w-3.5 h-3.5 text-fg-muted cursor-help"
18381820
:aria-label="$t('package.trends.known_anomalies')"
18391821
/>
18401822
<template #content>
1841-
<div class="flex flex-col gap-3">
1842-
<p class="text-xs text-fg-muted">
1843-
{{ $t('package.trends.known_anomalies_description') }}
1844-
</p>
1845-
<div v-if="hasAnomalies">
1846-
<p class="text-xs text-fg-subtle font-medium">
1847-
{{ $t('package.trends.known_anomalies_ranges') }}
1848-
</p>
1849-
<ul class="text-xs text-fg-subtle list-disc list-inside">
1850-
<li v-for="a in packageAnomalies" :key="`${a.packageName}-${a.start}`">
1851-
{{
1852-
isMultiPackageMode
1853-
? $t('package.trends.known_anomalies_range_named', {
1854-
packageName: a.packageName,
1855-
start: formatAnomalyDate(a.start),
1856-
end: formatAnomalyDate(a.end),
1857-
})
1858-
: $t('package.trends.known_anomalies_range', {
1859-
start: formatAnomalyDate(a.start),
1860-
end: formatAnomalyDate(a.end),
1861-
})
1862-
}}
1863-
</li>
1864-
</ul>
1865-
</div>
1866-
<p v-else class="text-xs text-fg-muted">
1867-
{{ $t('package.trends.known_anomalies_none', effectivePackageNames.length) }}
1868-
</p>
1869-
<div class="flex justify-end">
1870-
<LinkBase
1871-
to="https://github.com/npmx-dev/npmx.dev/edit/main/app/utils/download-anomalies.data.ts"
1872-
class="text-xs text-accent"
1873-
>
1874-
{{ $t('package.trends.known_anomalies_contribute') }}
1875-
</LinkBase>
1876-
</div>
1877-
</div>
1823+
<p class="text-xs text-fg-muted">
1824+
{{ $t('package.trends.known_anomalies_description') }}
1825+
</p>
18781826
</template>
18791827
</TooltipApp>
18801828
</span>
18811829
<label
18821830
class="flex items-center gap-1.5 text-2xs font-mono text-fg-subtle cursor-pointer"
1883-
:class="{ 'opacity-50 pointer-events-none': !hasAnomalies }"
18841831
>
18851832
<input
1886-
:checked="settings.chartFilter.anomaliesFixed && hasAnomalies"
1833+
:checked="settings.chartFilter.anomaliesFixed"
18871834
@change="
18881835
settings.chartFilter.anomaliesFixed = ($event.target as HTMLInputElement).checked
18891836
"
18901837
type="checkbox"
1891-
:disabled="!hasAnomalies"
18921838
class="accent-[var(--accent-color,var(--fg-subtle))]"
18931839
/>
18941840
{{ $t('package.trends.apply_correction') }}

app/components/Package/WeeklyDownloadStats.vue

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { useCssVariables } from '~/composables/useColors'
44
import type { WeeklyDataPoint } from '~/types/chart'
55
import { applyDataCorrection } from '~/utils/chart-data-correction'
66
import { OKLCH_NEUTRAL_FALLBACK, lightenOklch } from '~/utils/colors'
7-
import { applyBlocklistCorrection } from '~/utils/download-anomalies'
7+
import { applyHampelCorrection } from '~/utils/download-anomalies'
88
import type { RepoRef } from '#shared/utils/git-providers'
99
import type { VueUiSparklineConfig, VueUiSparklineDatasetItem } from 'vue-data-ui'
1010
import { onKeyDown } from '@vueuse/core'
@@ -186,11 +186,7 @@ const correctedDownloads = computed<WeeklyDataPoint[]>(() => {
186186
let data = weeklyDownloads.value as WeeklyDataPoint[]
187187
if (!data.length) return data
188188
if (settings.value.chartFilter.anomaliesFixed) {
189-
data = applyBlocklistCorrection({
190-
data,
191-
packageName: props.packageName,
192-
granularity: 'weekly',
193-
}) as WeeklyDataPoint[]
189+
data = applyHampelCorrection(data) as WeeklyDataPoint[]
194190
}
195191
data = applyDataCorrection(data, settings.value.chartFilter) as WeeklyDataPoint[]
196192
return data

app/utils/download-anomalies.data.ts

Lines changed: 0 additions & 30 deletions
This file was deleted.

app/utils/download-anomalies.ts

Lines changed: 69 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,129 +1,90 @@
1-
import type { ChartTimeGranularity, EvolutionData } from '~/types/chart'
2-
import { DOWNLOAD_ANOMALIES } from './download-anomalies.data'
1+
import type { EvolutionData } from '~/types/chart'
32

4-
export type DownloadAnomalyBound = {
5-
date: string // YYYY-MM-DD
6-
weeklyDownloads: number
7-
}
3+
/**
4+
* Hampel filter for automatic anomaly detection and correction.
5+
*
6+
* For each data point, computes the median and Median Absolute Deviation (MAD)
7+
* of a surrounding window. Points deviating more than `threshold` MADs from
8+
* the local median are flagged as anomalies and replaced with the median.
9+
*
10+
* This approach is unbiased — it applies the same statistical test to every
11+
* package equally, with no manual curation.
12+
*/
813

9-
export type DownloadAnomaly = {
10-
packageName: string
11-
start: DownloadAnomalyBound
12-
end: DownloadAnomalyBound
13-
}
14+
const DEFAULT_HALF_WINDOW = 3
15+
const DEFAULT_THRESHOLD = 3
1416

15-
function getDateString(point: Record<string, any>, granularity: ChartTimeGranularity): string {
16-
switch (granularity) {
17-
case 'daily':
18-
return point.day
19-
case 'weekly':
20-
return point.weekStart
21-
case 'monthly':
22-
return `${point.month}-01`
23-
case 'yearly':
24-
return `${point.year}-01-01`
25-
}
17+
function median(values: number[]): number {
18+
const sorted = [...values].sort((a, b) => a - b)
19+
const mid = Math.floor(sorted.length / 2)
20+
return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2
2621
}
2722

28-
/**
29-
* For daily the point date falls strictly between the anomaly bounds.
30-
* For weekly the point date is the week start, and the full 7-day range is
31-
* checked so any overlapping week is affected.
32-
* For monthly/yearly the anomaly bounds are truncated to the same resolution
33-
* so that any period overlapping the anomaly is caught (inclusive).
34-
*/
35-
function isDateAffected(
36-
date: string,
37-
anomaly: DownloadAnomaly,
38-
granularity: ChartTimeGranularity,
39-
): boolean {
40-
switch (granularity) {
41-
case 'daily':
42-
return date > anomaly.start.date && date < anomaly.end.date
43-
case 'weekly': {
44-
const startWeek = date
45-
const weekStartDate = new Date(`${date}T00:00:00Z`)
46-
const weekEndDate = new Date(weekStartDate)
47-
weekEndDate.setUTCDate(weekEndDate.getUTCDate() + 6)
48-
const endWeek = weekEndDate.toISOString().slice(0, 10)
49-
return startWeek <= anomaly.end.date && endWeek >= anomaly.start.date
50-
}
51-
case 'monthly': {
52-
const startMonth = anomaly.start.date.slice(0, 7) + '-01'
53-
const endMonth = anomaly.end.date.slice(0, 7) + '-01'
54-
return date >= startMonth && date <= endMonth
55-
}
56-
case 'yearly': {
57-
const startYear = anomaly.start.date.slice(0, 4) + '-01-01'
58-
const endYear = anomaly.end.date.slice(0, 4) + '-01-01'
59-
return date >= startYear && date <= endYear
60-
}
61-
}
23+
function mad(values: number[], med: number): number {
24+
const deviations = values.map(v => Math.abs(v - med))
25+
return median(deviations)
6226
}
6327

64-
function scaleWeeklyValue(weeklyValue: number, granularity: ChartTimeGranularity): number {
65-
switch (granularity) {
66-
case 'daily':
67-
return Math.round(weeklyValue / 7)
68-
case 'weekly':
69-
return weeklyValue
70-
case 'monthly':
71-
return Math.round((weeklyValue / 7) * 30)
72-
case 'yearly':
73-
return Math.round((weeklyValue / 7) * 365)
74-
}
75-
}
28+
export function applyHampelCorrection(
29+
data: EvolutionData,
30+
opts?: { halfWindow?: number; threshold?: number },
31+
): EvolutionData {
32+
// halfWindow controls how many neighbors on each side to consider.
33+
// A halfWindow of 3 means we look at 7 points total (3 left + current + 3 right).
34+
const halfWindow = opts?.halfWindow ?? DEFAULT_HALF_WINDOW
7635

77-
export function getAnomaliesForPackages(
78-
packageNames: string[],
79-
): { packageName: string; start: string; end: string }[] {
80-
return DOWNLOAD_ANOMALIES.filter(a => packageNames.includes(a.packageName)).map(a => ({
81-
packageName: a.packageName,
82-
start: a.start.date,
83-
end: a.end.date,
84-
}))
85-
}
36+
// threshold controls sensitivity. A value of 3 means a point must deviate
37+
// more than 3 scaled MADs from the local median to be flagged.
38+
// Higher = less sensitive, lower = more aggressive filtering.
39+
const threshold = opts?.threshold ?? DEFAULT_THRESHOLD
8640

87-
export function applyBlocklistCorrection(opts: {
88-
data: EvolutionData
89-
packageName: string
90-
granularity: ChartTimeGranularity
91-
}): EvolutionData {
92-
const { data, packageName, granularity } = opts
93-
const anomalies = DOWNLOAD_ANOMALIES.filter(a => a.packageName === packageName)
94-
if (!anomalies.length) return data
41+
// Not enough data to form a full window — return as-is.
42+
if (data.length < halfWindow * 2 + 1) return data
9543

96-
// Clone to avoid mutation
44+
const values = (data as Array<{ value: number }>).map(d => d.value)
45+
// Clone to avoid mutating the original data.
9746
const result = (data as Array<Record<string, any>>).map(d => ({ ...d }))
9847

99-
for (const anomaly of anomalies) {
100-
// Find indices of affected points
101-
const affectedIndices: number[] = []
102-
for (let i = 0; i < result.length; i++) {
103-
const date = getDateString(result[i]!, granularity)
104-
if (isDateAffected(date, anomaly, granularity)) {
105-
affectedIndices.push(i)
106-
}
107-
}
48+
for (let i = 0; i < values.length; i++) {
49+
// Build a sliding window around the current point, clamped to array bounds.
50+
const start = Math.max(0, i - halfWindow)
51+
const end = Math.min(values.length - 1, i + halfWindow)
52+
const window = values.slice(start, end + 1)
10853

109-
if (!affectedIndices.length) continue
54+
// The median is robust to outliers — unlike the mean, a single spike
55+
// won't pull it away from the true central tendency.
56+
const windowMedian = median(window)
11057

111-
const firstAffected = affectedIndices[0]!
112-
const lastAffected = affectedIndices[affectedIndices.length - 1]!
58+
// MAD (Median Absolute Deviation) measures spread without being
59+
// influenced by the outliers we're trying to detect.
60+
const windowMad = mad(window, windowMedian)
11361

114-
// Use neighbors when available, fall back to scaled weeklyDownloads
115-
const scaledStart = scaleWeeklyValue(anomaly.start.weeklyDownloads, granularity)
116-
const scaledEnd = scaleWeeklyValue(anomaly.end.weeklyDownloads, granularity)
62+
// How far this point is from the local median.
63+
const deviation = Math.abs(values[i]! - windowMedian)
11764

118-
const startVal = firstAffected > 0 ? result[firstAffected - 1]!.value : scaledStart
119-
const endVal = lastAffected < result.length - 1 ? result[lastAffected + 1]!.value : scaledEnd
65+
// MAD of 0 means more than half of the values in the window equal the window median.
66+
// If this point differs from the median at all, it's an outlier.
67+
if (windowMad === 0) {
68+
if (deviation > 0) {
69+
result[i]!.value = Math.round(windowMedian)
70+
result[i]!.hasAnomaly = true
71+
}
72+
continue
73+
}
12074

121-
const count = affectedIndices.length
122-
for (let i = 0; i < count; i++) {
123-
const t = (i + 1) / (count + 1)
124-
result[affectedIndices[i]!]!.value = Math.round(startVal + t * (endVal - startVal))
125-
result[affectedIndices[i]!]!.hasAnomaly = true
75+
// Scale MAD to approximate standard deviation using the consistency
76+
// constant 1.4826 (valid for normally distributed data).
77+
// The resulting score is essentially "how many standard deviations
78+
// away from the local median is this point?"
79+
const score = deviation / (windowMad * 1.4826)
80+
81+
// If the score exceeds the threshold, replace with the median.
82+
// This corrects the spike while preserving the surrounding trend.
83+
if (score > threshold) {
84+
result[i]!.value = Math.round(windowMedian)
85+
result[i]!.hasAnomaly = true
12686
}
12787
}
88+
12889
return result as EvolutionData
12990
}

0 commit comments

Comments
 (0)