From 2bf55a0ef5573eeab20d5cff8d24f5a969938743 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 2 Jun 2026 17:12:55 -0400 Subject: [PATCH 1/5] Add dataset vintage profiles as the single source of truth for source years Source release years were declared as literal defaults in three places (the provider signature, the checkpoint signature, and the CLI), so they could drift from the real build: cps_source_year defaulted to 2023 (income year 2022) while every production build overrode it to 2025 via a shell flag. The stale literal sat in three signatures and failed open -- nothing errored. Introduce microplex_us.vintages. A DatasetProfile declares, in ONE place, the model year a dataset represents and each source's release plus how its dollars reach that year (native, or aged with a component-specific factor family). A coherence check asserts every source reaches model_year or declares an explicit gap_reason. MP_2024 is the current 2024 base dataset: CPS ASEC 2025 (income year 2024) native spine, PUF 2015->2024 via SOI factors, ACS 2024, SIPP 2023->2024, SCF 2022->2024. Thread the year defaults through MP_2024 so the value is defined once and the safe path is the only path; the stale CPS default becomes the profile's 2025. A regression guard asserts the provider defaults derive from the profile. Foundation for codex: follow-ups are to drop the per-call --*-year args in favor of `--profile`, and add a build-time gate that checks a produced artifact against the active profile (freshness vs latest release + basis coherence). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../pipelines/pe_us_data_rebuild.py | 12 +- .../pe_us_data_rebuild_checkpoint.py | 19 +- src/microplex_us/vintages.py | 165 ++++++++++++++++++ tests/pipelines/test_pe_us_data_rebuild.py | 19 ++ tests/test_vintages.py | 103 +++++++++++ 5 files changed, 305 insertions(+), 13 deletions(-) create mode 100644 src/microplex_us/vintages.py create mode 100644 tests/test_vintages.py diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild.py b/src/microplex_us/pipelines/pe_us_data_rebuild.py index d09e95d..d89645e 100644 --- a/src/microplex_us/pipelines/pe_us_data_rebuild.py +++ b/src/microplex_us/pipelines/pe_us_data_rebuild.py @@ -7,6 +7,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from microplex_us.vintages import MP_2024 + if TYPE_CHECKING: from microplex.core import SourceProvider @@ -95,10 +97,10 @@ def default_policyengine_us_data_rebuild_config( def default_policyengine_us_data_rebuild_source_providers( *, - cps_source_year: int = 2023, + cps_source_year: int = MP_2024.cps_asec.release, cps_cache_dir: str | Path | None = None, cps_download: bool = True, - puf_target_year: int = 2024, + puf_target_year: int = MP_2024.model_year, puf_cps_reference_year: int | None = None, puf_cache_dir: str | Path | None = None, puf_path: str | Path | None = None, @@ -107,9 +109,9 @@ def default_policyengine_us_data_rebuild_source_providers( include_donor_surveys: bool = True, include_sipp: bool | None = None, include_scf: bool | None = None, - acs_year: int = 2024, - sipp_year: int = 2023, - scf_year: int = 2022, + acs_year: int = MP_2024.acs.release, + sipp_year: int = MP_2024.sipp.release, + scf_year: int = MP_2024.scf.release, donor_cache_dir: str | Path | None = None, policyengine_us_data_repo: str | Path | None = None, policyengine_us_data_python: str | Path | None = None, diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py 
index a37f993..0fa0d89 100644 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py +++ b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py @@ -64,6 +64,7 @@ write_us_stage_run_manifests_from_artifact_manifest, ) from microplex_us.variables import prune_redundant_variables +from microplex_us.vintages import MP_2024 if TYPE_CHECKING: from microplex.core import SourceProvider @@ -1969,7 +1970,7 @@ def run_policyengine_us_data_rebuild_checkpoint( config_overrides: dict[str, Any] | None = None, providers: tuple[SourceProvider, ...] | list[SourceProvider] | None = None, queries: dict[str, SourceQuery] | None = None, - cps_source_year: int = 2023, + cps_source_year: int = MP_2024.cps_asec.release, cps_cache_dir: str | Path | None = None, cps_download: bool = True, puf_target_year: int | None = None, @@ -1981,9 +1982,9 @@ def run_policyengine_us_data_rebuild_checkpoint( include_donor_surveys: bool = True, include_sipp: bool | None = None, include_scf: bool | None = None, - acs_year: int = 2024, - sipp_year: int = 2023, - scf_year: int = 2022, + acs_year: int = MP_2024.acs.release, + sipp_year: int = MP_2024.sipp.release, + scf_year: int = MP_2024.scf.release, donor_cache_dir: str | Path | None = None, policyengine_us_data_repo: str | Path | None = None, policyengine_us_data_python: str | Path | None = None, @@ -2261,12 +2262,14 @@ def main(argv: list[str] | None = None) -> None: "variables. See docs/next-run-plan.md." 
), ) - parser.add_argument("--cps-source-year", type=int, default=2023) + parser.add_argument( + "--cps-source-year", type=int, default=MP_2024.cps_asec.release + ) parser.add_argument("--puf-target-year", type=int) parser.add_argument("--puf-cps-reference-year", type=int) - parser.add_argument("--acs-year", type=int, default=2024) - parser.add_argument("--sipp-year", type=int, default=2023) - parser.add_argument("--scf-year", type=int, default=2022) + parser.add_argument("--acs-year", type=int, default=MP_2024.acs.release) + parser.add_argument("--sipp-year", type=int, default=MP_2024.sipp.release) + parser.add_argument("--scf-year", type=int, default=MP_2024.scf.release) parser.add_argument("--cps-cache-dir") parser.add_argument("--puf-cache-dir") parser.add_argument("--donor-cache-dir") diff --git a/src/microplex_us/vintages.py b/src/microplex_us/vintages.py new file mode 100644 index 0000000..65ef650 --- /dev/null +++ b/src/microplex_us/vintages.py @@ -0,0 +1,165 @@ +"""Single source of truth for the source vintages a built dataset uses. + +A :class:`DatasetProfile` declares, in ONE place, the model year a dataset +represents and the exact source *release* feeding each input, plus how that +release's dollars reach the model year (used natively, or aged with a +component-specific factor family). + +Build code reads vintages from a profile instead of per-call literal defaults, +so a stale year cannot hide in a function signature, a CLI default, or a +forgotten shell flag: the value is defined once and the safe path is the only +path. (The motivating bug: ``cps_source_year`` defaulted to 2023 -- income year +2022 -- while every production build overrode it to 2025; the stale literal sat +in three signatures for who knows how long because nothing failed.) 
+ +The coherence checks here are the spec the build must satisfy: every source must +reach ``model_year`` -- either it is native to that year (``income_year == +model_year``) or it declares an ``age_to == model_year`` aging step. A source +that does not yet reach the model year must declare a ``gap_reason`` so the gap +is explicit rather than silent. A future build-time gate verifies a produced +artifact against the active profile; this module guarantees the *profile itself* +is internally consistent. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Release: + """One source release and how its dollars reach a model year. + + Attributes: + release: the survey/file release actually loaded (e.g. CPS ASEC 2025). + income_year: the calendar/income year that release represents. CPS ASEC + survey year ``Y`` covers income year ``Y - 1`` (ASEC 2025 -> 2024); + most other sources have ``release == income_year``. + age_to: when set, dollar variables are aged from ``income_year`` to this + year using ``factors``; when ``None`` the release is used on its + native basis. + factors: label of the component-specific growth-factor family used when + aging (e.g. ``"soi"``). Required iff ``age_to`` is set; the build + binds the label to the actual factor implementation. + gap_reason: explicit, temporary acknowledgement that this source does not + yet reach the model year (e.g. aging not wired). Lets a profile stay + honest about a known gap without silently passing coherence. + """ + + release: int + income_year: int + age_to: int | None = None + factors: str | None = None + gap_reason: str | None = None + + def __post_init__(self) -> None: + if self.age_to is not None and self.factors is None: + raise ValueError( + f"Release(release={self.release}) sets age_to={self.age_to} but " + "no `factors` family to age with." 
+ ) + if self.age_to is None and self.factors is not None: + raise ValueError( + f"Release(release={self.release}) sets factors={self.factors!r} " + "but no `age_to` year to age toward." + ) + if self.age_to is not None and self.age_to < self.income_year: + raise ValueError( + f"Release(release={self.release}) ages backward: age_to=" + f"{self.age_to} < income_year={self.income_year}." + ) + + @property + def effective_year(self) -> int: + """The model year this release's dollars land on after any aging.""" + return self.age_to if self.age_to is not None else self.income_year + + +@dataclass(frozen=True) +class DatasetProfile: + """The complete vintage definition for one built dataset. + + ``model_year`` is the year the dataset represents; every source must reach it + (or declare a ``gap_reason``). + """ + + name: str + model_year: int + cps_asec: Release + puf: Release + acs: Release + sipp: Release + scf: Release + + def sources(self) -> dict[str, Release]: + return { + "cps_asec": self.cps_asec, + "puf": self.puf, + "acs": self.acs, + "sipp": self.sipp, + "scf": self.scf, + } + + def incoherent_sources(self) -> dict[str, str]: + """Map each source that fails to reach ``model_year`` (and has not + declared a ``gap_reason``) to a human-readable explanation.""" + problems: dict[str, str] = {} + for name, release in self.sources().items(): + if release.gap_reason is not None: + continue + if release.effective_year != self.model_year: + problems[name] = ( + f"reaches {release.effective_year} (release {release.release}, " + f"income {release.income_year}, age_to {release.age_to}); " + f"model_year is {self.model_year}" + ) + return problems + + def declared_gaps(self) -> dict[str, str]: + """Map each source with a declared (acknowledged) basis gap to its reason.""" + return { + name: release.gap_reason + for name, release in self.sources().items() + if release.gap_reason is not None + } + + def __post_init__(self) -> None: + problems = self.incoherent_sources() + if 
problems: + detail = "; ".join(f"{name}: {why}" for name, why in problems.items()) + raise ValueError( + f"DatasetProfile {self.name!r} is incoherent: every source must " + f"reach model_year {self.model_year} or declare a gap_reason. {detail}" + ) + + +# The current Microplex eCPS-replacement target: a 2024 base dataset that +# replaces ``enhanced_cps_2024``. Source releases match what the production build +# loads today; the aging declarations are the spec the build satisfies (PUF ages +# via SOI factors; SIPP/SCF aging to 2024 landed in #185; ACS donor is now the +# native-2024 release). +MP_2024 = DatasetProfile( + name="mp_2024", + model_year=2024, + # CPS ASEC survey year 2025 == income/calendar year 2024: native 2024 spine. + cps_asec=Release(release=2025, income_year=2024), + # Public-use PUF base is 2015 (latest released); aged to 2024 via SOI factors. + puf=Release(release=2015, income_year=2015, age_to=2024, factors="soi"), + # ACS donor at the native-2024 release. + acs=Release(release=2024, income_year=2024), + # SIPP/SCF donors aged from their latest releases to 2024. 
+ sipp=Release(release=2023, income_year=2023, age_to=2024, factors="pe_growfactors"), + scf=Release(release=2022, income_year=2022, age_to=2024, factors="pe_growfactors"), +) + + +PROFILES: dict[str, DatasetProfile] = {MP_2024.name: MP_2024} + + +def get_profile(name: str) -> DatasetProfile: + """Return the named dataset profile, or raise with the known names.""" + try: + return PROFILES[name] + except KeyError: + known = ", ".join(sorted(PROFILES)) + raise KeyError(f"Unknown dataset profile {name!r}; known profiles: {known}") diff --git a/tests/pipelines/test_pe_us_data_rebuild.py b/tests/pipelines/test_pe_us_data_rebuild.py index aacfe69..11eee6e 100644 --- a/tests/pipelines/test_pe_us_data_rebuild.py +++ b/tests/pipelines/test_pe_us_data_rebuild.py @@ -203,3 +203,22 @@ def test_build_policyengine_us_data_rebuild_pipeline_returns_configured_pipeline assert pipeline.config.calibration_max_iter == 77 assert pipeline.config.synthesis_backend == "seed" assert pipeline.config.calibration_backend == "entropy" + + +def test_source_provider_year_defaults_derive_from_mp_2024_profile() -> None: + # Year defaults come from the single-source-of-truth vintage profile, so a + # stale literal cannot silently return. The CPS spine default is the profile's + # ASEC release (2025 = income year 2024), not the 2023 that used to require a + # CLI override to be correct. 
+ import inspect + + from microplex_us.vintages import MP_2024 + + params = inspect.signature( + default_policyengine_us_data_rebuild_source_providers + ).parameters + assert params["cps_source_year"].default == MP_2024.cps_asec.release == 2025 + assert params["acs_year"].default == MP_2024.acs.release + assert params["sipp_year"].default == MP_2024.sipp.release + assert params["scf_year"].default == MP_2024.scf.release + assert params["puf_target_year"].default == MP_2024.model_year diff --git a/tests/test_vintages.py b/tests/test_vintages.py new file mode 100644 index 0000000..0cac090 --- /dev/null +++ b/tests/test_vintages.py @@ -0,0 +1,103 @@ +"""Tests for the single-source-of-truth dataset vintage profiles.""" + +import pytest + +from microplex_us.vintages import ( + MP_2024, + DatasetProfile, + Release, + get_profile, +) + + +def _release(**overrides) -> Release: + base = dict(release=2024, income_year=2024) + base.update(overrides) + return Release(**base) + + +def _profile(**overrides) -> DatasetProfile: + base = dict( + name="test", + model_year=2024, + cps_asec=Release(release=2025, income_year=2024), + puf=Release(release=2015, income_year=2015, age_to=2024, factors="soi"), + acs=Release(release=2024, income_year=2024), + sipp=Release(release=2023, income_year=2023, age_to=2024, factors="g"), + scf=Release(release=2022, income_year=2022, age_to=2024, factors="g"), + ) + base.update(overrides) + return DatasetProfile(**base) + + +# --- Release --------------------------------------------------------------- + + +def test_release_effective_year_native_vs_aged(): + assert Release(release=2025, income_year=2024).effective_year == 2024 + assert ( + Release(release=2015, income_year=2015, age_to=2024, factors="soi").effective_year + == 2024 + ) + + +def test_release_age_to_requires_factors(): + with pytest.raises(ValueError, match="factors"): + Release(release=2015, income_year=2015, age_to=2024) + + +def test_release_factors_requires_age_to(): + with 
pytest.raises(ValueError, match="age_to"): + Release(release=2015, income_year=2015, factors="soi") + + +def test_release_cannot_age_backward(): + with pytest.raises(ValueError, match="ages backward"): + Release(release=2015, income_year=2024, age_to=2022, factors="soi") + + +# --- DatasetProfile coherence --------------------------------------------- + + +def test_mp_2024_is_coherent_and_has_no_gaps(): + assert MP_2024.model_year == 2024 + assert MP_2024.incoherent_sources() == {} + assert MP_2024.declared_gaps() == {} + # Every source's dollars land on the model year. + for name, release in MP_2024.sources().items(): + assert release.effective_year == 2024, name + + +def test_mp_2024_cps_spine_is_native_2024_income(): + # ASEC survey year 2025 -> income year 2024; the spine is native, not aged. + assert MP_2024.cps_asec.release == 2025 + assert MP_2024.cps_asec.income_year == 2024 + assert MP_2024.cps_asec.age_to is None + + +def test_incoherent_profile_raises(): + # A donor stuck at 2022 with no aging and no acknowledged gap is incoherent. 
+ with pytest.raises(ValueError, match="incoherent"): + _profile(scf=Release(release=2022, income_year=2022)) + + +def test_declared_gap_passes_coherence_but_is_surfaced(): + profile = _profile( + scf=Release(release=2022, income_year=2022, gap_reason="aging not wired yet") + ) + assert profile.incoherent_sources() == {} + assert profile.declared_gaps() == {"scf": "aging not wired yet"} + + +def test_aged_source_reaching_wrong_year_is_incoherent(): + with pytest.raises(ValueError, match="incoherent"): + _profile(sipp=Release(release=2023, income_year=2023, age_to=2023, factors="g")) + + +# --- registry -------------------------------------------------------------- + + +def test_get_profile_returns_known_and_raises_unknown(): + assert get_profile("mp_2024") is MP_2024 + with pytest.raises(KeyError, match="Unknown dataset profile"): + get_profile("mp_1999") From 0e8f3e3f127aec935c7bb280f67a7008dd1f8ac0 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 2 Jun 2026 17:30:57 -0400 Subject: [PATCH 2/5] Fix cycle review findings: correct ACS vintage + tie profile to manifest Independent review found MP_2024.acs declared release 2024 while the ACS donor loader is pinned to ACS_2022 (manifest default_year=2022) and is excluded from TARGET_YEAR_UPRATED_SURVEYS (never aged). The real ACS vintage is 2022; the provider default had silently drifted to 2024 vs what every build script (--acs-year 2022), the manifest, and the gate1 build log actually load. The profile enshrining 2024 defeated its own purpose. - Correct MP_2024.acs to release 2022 with a declared gap_reason. The acs_year default now resolves to 2022 (matching the loader/scripts), so the build no longer needs to override it; the gap_reason flags that an ACS-2024 move is a loader migration, not silently assumed done. - Add a manifest-tie test asserting each donor release equals the pe_source_impute manifest default_year -- catches exactly this profile-vs-loader drift. 
- Extend the default-derivation regression guard to the checkpoint signature, not just the provider. - Update the coherence test for the declared ACS gap; harden Release (reject an empty gap_reason) and get_profile (chain-free KeyError). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/microplex_us/vintages.py | 24 ++++++++++++++++--- tests/pipelines/test_pe_us_data_rebuild.py | 19 ++++++++++++++- tests/test_vintages.py | 27 +++++++++++++++++++--- 3 files changed, 63 insertions(+), 7 deletions(-) diff --git a/src/microplex_us/vintages.py b/src/microplex_us/vintages.py index 65ef650..04f6245 100644 --- a/src/microplex_us/vintages.py +++ b/src/microplex_us/vintages.py @@ -68,6 +68,11 @@ def __post_init__(self) -> None: f"Release(release={self.release}) ages backward: age_to=" f"{self.age_to} < income_year={self.income_year}." ) + if self.gap_reason is not None and not self.gap_reason.strip(): + raise ValueError( + f"Release(release={self.release}) has an empty gap_reason; use None " + "for no declared gap or give a real explanation." + ) @property def effective_year(self) -> int: @@ -145,8 +150,19 @@ def __post_init__(self) -> None: cps_asec=Release(release=2025, income_year=2024), # Public-use PUF base is 2015 (latest released); aged to 2024 via SOI factors. puf=Release(release=2015, income_year=2015, age_to=2024, factors="soi"), - # ACS donor at the native-2024 release. - acs=Release(release=2024, income_year=2024), + # ACS donor is pinned to the 2022 release (manifest block ACS_2022, + # default_year=2022) and is not in TARGET_YEAR_UPRATED_SURVEYS, so it is not + # aged to the model year. The provider default had drifted to 2024 while the + # loader stayed at 2022; declare the real release and flag the gap so the + # ACS-2024 migration is explicit rather than silently assumed done. 
+ acs=Release( + release=2022, + income_year=2022, + gap_reason=( + "ACS donor loads the 2022 release (manifest ACS_2022) and is not aged " + "to the model year; reconcile when ACS moves to the 2024 release." + ), + ), # SIPP/SCF donors aged from their latest releases to 2024. sipp=Release(release=2023, income_year=2023, age_to=2024, factors="pe_growfactors"), scf=Release(release=2022, income_year=2022, age_to=2024, factors="pe_growfactors"), @@ -162,4 +178,6 @@ def get_profile(name: str) -> DatasetProfile: return PROFILES[name] except KeyError: known = ", ".join(sorted(PROFILES)) - raise KeyError(f"Unknown dataset profile {name!r}; known profiles: {known}") + raise KeyError( + f"Unknown dataset profile {name!r}; known profiles: {known}" + ) from None diff --git a/tests/pipelines/test_pe_us_data_rebuild.py b/tests/pipelines/test_pe_us_data_rebuild.py index 11eee6e..c6be713 100644 --- a/tests/pipelines/test_pe_us_data_rebuild.py +++ b/tests/pipelines/test_pe_us_data_rebuild.py @@ -138,7 +138,10 @@ def test_default_policyengine_us_data_rebuild_source_providers_use_pe_style_bund SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF ) assert isinstance(providers[2], ACSSourceProvider) - assert providers[2].year == 2024 + # ACS is pinned to its 2022 release (manifest ACS_2022, not aged); the default + # derives from MP_2024.acs.release. The prior 2024 here disagreed with the + # loader -- see the declared ACS gap in the vintage profile. 
+ assert providers[2].year == 2022 assert isinstance(providers[3], SIPPSourceProvider) assert providers[3].block == "tips" assert providers[3].target_year == 2024 @@ -222,3 +225,17 @@ def test_source_provider_year_defaults_derive_from_mp_2024_profile() -> None: assert params["sipp_year"].default == MP_2024.sipp.release assert params["scf_year"].default == MP_2024.scf.release assert params["puf_target_year"].default == MP_2024.model_year + + # The checkpoint signature is the second of the three sites the stale literal + # used to live in; guard it too so a revert there cannot pass silently. + from microplex_us.pipelines.pe_us_data_rebuild_checkpoint import ( + run_policyengine_us_data_rebuild_checkpoint, + ) + + checkpoint_params = inspect.signature( + run_policyengine_us_data_rebuild_checkpoint + ).parameters + assert checkpoint_params["cps_source_year"].default == MP_2024.cps_asec.release + assert checkpoint_params["acs_year"].default == MP_2024.acs.release + assert checkpoint_params["sipp_year"].default == MP_2024.sipp.release + assert checkpoint_params["scf_year"].default == MP_2024.scf.release diff --git a/tests/test_vintages.py b/tests/test_vintages.py index 0cac090..178bb8d 100644 --- a/tests/test_vintages.py +++ b/tests/test_vintages.py @@ -59,15 +59,36 @@ def test_release_cannot_age_backward(): # --- DatasetProfile coherence --------------------------------------------- -def test_mp_2024_is_coherent_and_has_no_gaps(): +def test_mp_2024_is_coherent_with_only_the_acs_gap_declared(): assert MP_2024.model_year == 2024 + # No *undeclared* incoherence. The ACS donor is an explicitly declared gap + # (pinned to its 2022 release, not yet aged to the model year). assert MP_2024.incoherent_sources() == {} - assert MP_2024.declared_gaps() == {} - # Every source's dollars land on the model year. + assert set(MP_2024.declared_gaps()) == {"acs"} + # Every non-gapped source's dollars land on the model year. 
for name, release in MP_2024.sources().items(): + if name in MP_2024.declared_gaps(): + continue assert release.effective_year == 2024, name +def test_release_rejects_empty_gap_reason(): + with pytest.raises(ValueError, match="empty gap_reason"): + Release(release=2022, income_year=2022, gap_reason=" ") + + +def test_mp_2024_donor_releases_match_source_manifest(): + # The profile's donor release years must equal what the build's donor loaders + # actually use (the pe_source_impute manifest ``default_year``), or the profile + # asserts a vintage the build does not load. This guard catches exactly the ACS + # 2024-vs-ACS_2022 mismatch the profile is meant to surface. + from microplex_us.pe_source_impute_specs import get_pe_source_impute_block_spec + + for source, block in [("acs", "acs"), ("sipp", "sipp_tips"), ("scf", "scf")]: + spec = get_pe_source_impute_block_spec(block) + assert MP_2024.sources()[source].release == spec.default_year, source + + def test_mp_2024_cps_spine_is_native_2024_income(): # ASEC survey year 2025 -> income year 2024; the spine is native, not aged. assert MP_2024.cps_asec.release == 2025 From 8f76337b8d144fc6a59f2b2fdd22c6baabdc5796 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 2 Jun 2026 21:19:14 -0400 Subject: [PATCH 3/5] Reshape vintage profile: key on (dataset, year), derive names; ACS -> 2024 Toward making the source year a single key rather than a value smeared across the provider/checkpoint/CLI/scripts/names: - Profiles are addressed by (dataset, model_year): DatasetProfile.key and .name (mp_ecps_2024), plus resolve_profile(dataset, year). - version_id(variant, commit, build_date) derives the canonical build name from the profile, so the asec{cps}-calendar{model} years in the name cannot drift from the data (e.g. mp-ecps-shaped-asec2025-calendar2024-...). Names become an output of the profile, never hand-typed. 
- source_years() exposes the per-source years from one place, so callers thread a profile instead of five loose year args. - Correct MP_2024.acs to the native 2024 release (codex #184 default + the ACS-2024 donor H5 fallback). Drops the earlier 2022 gap that wrongly anchored on the stale manifest/scripts. ACS is excluded from the manifest-tie because MP loads a local acs_2024.h5 beyond the module's ACS_2022 baseline. Next (same PR or follow-up): thread `--profile` / source_years() through the CLI and build scripts, derive the artifact version_id from the profile, and retire the per-year args and `--*-year` flags. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/microplex_us/vintages.py | 64 ++++++++++++++++------ tests/pipelines/test_pe_us_data_rebuild.py | 7 +-- tests/test_vintages.py | 57 ++++++++++++++----- 3 files changed, 93 insertions(+), 35 deletions(-) diff --git a/src/microplex_us/vintages.py b/src/microplex_us/vintages.py index 04f6245..6690ac8 100644 --- a/src/microplex_us/vintages.py +++ b/src/microplex_us/vintages.py @@ -88,7 +88,7 @@ class DatasetProfile: (or declare a ``gap_reason``). """ - name: str + dataset: str model_year: int cps_asec: Release puf: Release @@ -96,6 +96,16 @@ class DatasetProfile: sipp: Release scf: Release + @property + def name(self) -> str: + """Stable profile name ``{dataset}_{model_year}`` (e.g. ``mp_ecps_2024``).""" + return f"{self.dataset}_{self.model_year}" + + @property + def key(self) -> tuple[str, int]: + """The ``(dataset, model_year)`` identity a build is addressed by.""" + return (self.dataset, self.model_year) + def sources(self) -> dict[str, Release]: return { "cps_asec": self.cps_asec, @@ -105,6 +115,28 @@ def sources(self) -> dict[str, Release]: "scf": self.scf, } + def source_years(self) -> dict[str, int]: + """The release/target years the source providers consume, derived once from + this profile so callers pass the profile rather than loose per-source years. 
+ Keyed by the existing provider/checkpoint parameter names.""" + return { + "cps_source_year": self.cps_asec.release, + "puf_target_year": self.model_year, + "acs_year": self.acs.release, + "sipp_year": self.sipp.release, + "scf_year": self.scf.release, + } + + def version_id(self, *, variant: str, commit: str, build_date: str) -> str: + """Derive the canonical build/version id from the profile so the ASEC and + calendar years in the name cannot disagree with the data. Example: + ``mp-ecps-shaped-asec2025-calendar2024-{variant}-{commit}-{build_date}``.""" + prefix = self.dataset.replace("_", "-") + return ( + f"{prefix}-shaped-asec{self.cps_asec.release}-" + f"calendar{self.model_year}-{variant}-{commit}-{build_date}" + ) + def incoherent_sources(self) -> dict[str, str]: """Map each source that fails to reach ``model_year`` (and has not declared a ``gap_reason``) to a human-readable explanation.""" @@ -144,25 +176,19 @@ def __post_init__(self) -> None: # via SOI factors; SIPP/SCF aging to 2024 landed in #185; ACS donor is now the # native-2024 release). MP_2024 = DatasetProfile( - name="mp_2024", + dataset="mp_ecps", model_year=2024, # CPS ASEC survey year 2025 == income/calendar year 2024: native 2024 spine. cps_asec=Release(release=2025, income_year=2024), # Public-use PUF base is 2015 (latest released); aged to 2024 via SOI factors. puf=Release(release=2015, income_year=2015, age_to=2024, factors="soi"), - # ACS donor is pinned to the 2022 release (manifest block ACS_2022, - # default_year=2022) and is not in TARGET_YEAR_UPRATED_SURVEYS, so it is not - # aged to the model year. The provider default had drifted to 2024 while the - # loader stayed at 2022; declare the real release and flag the gap so the - # ACS-2024 migration is explicit rather than silently assumed done. - acs=Release( - release=2022, - income_year=2022, - gap_reason=( - "ACS donor loads the 2022 release (manifest ACS_2022) and is not aged " - "to the model year; reconcile when ACS moves to the 2024 release." 
- ), - ), + # ACS donor uses the native 2024 release (income year 2024 == model year, + # no aging). The PE-US-data Python module still only exposes ACS_2022, so MP + # loads a local acs_2024.h5 via the donor-provider H5 fallback added in #184; + # the provider default is 2024. The manifest block's default_year=2022 is the + # module baseline, not MP's intended vintage, so ACS is excluded from the + # manifest-tie check below. + acs=Release(release=2024, income_year=2024), # SIPP/SCF donors aged from their latest releases to 2024. sipp=Release(release=2023, income_year=2023, age_to=2024, factors="pe_growfactors"), scf=Release(release=2022, income_year=2022, age_to=2024, factors="pe_growfactors"), @@ -173,7 +199,7 @@ def __post_init__(self) -> None: def get_profile(name: str) -> DatasetProfile: - """Return the named dataset profile, or raise with the known names.""" + """Return the dataset profile by name (``{dataset}_{model_year}``).""" try: return PROFILES[name] except KeyError: @@ -181,3 +207,9 @@ def get_profile(name: str) -> DatasetProfile: raise KeyError( f"Unknown dataset profile {name!r}; known profiles: {known}" ) from None + + +def resolve_profile(dataset: str, model_year: int) -> DatasetProfile: + """Resolve a build's profile from its ``(dataset, model_year)`` key -- the + single identifier a build is keyed on.""" + return get_profile(f"{dataset}_{model_year}") diff --git a/tests/pipelines/test_pe_us_data_rebuild.py b/tests/pipelines/test_pe_us_data_rebuild.py index c6be713..8bc91e1 100644 --- a/tests/pipelines/test_pe_us_data_rebuild.py +++ b/tests/pipelines/test_pe_us_data_rebuild.py @@ -138,10 +138,9 @@ def test_default_policyengine_us_data_rebuild_source_providers_use_pe_style_bund SOCIAL_SECURITY_SPLIT_STRATEGY_PE_QRF ) assert isinstance(providers[2], ACSSourceProvider) - # ACS is pinned to its 2022 release (manifest ACS_2022, not aged); the default - # derives from MP_2024.acs.release. 
The prior 2024 here disagreed with the - # loader -- see the declared ACS gap in the vintage profile. - assert providers[2].year == 2022 + # ACS uses the native 2024 release (MP_2024.acs.release); MP loads acs_2024.h5 + # via the #184 donor H5 fallback. + assert providers[2].year == 2024 assert isinstance(providers[3], SIPPSourceProvider) assert providers[3].block == "tips" assert providers[3].target_year == 2024 diff --git a/tests/test_vintages.py b/tests/test_vintages.py index 178bb8d..7a67d90 100644 --- a/tests/test_vintages.py +++ b/tests/test_vintages.py @@ -7,6 +7,7 @@ DatasetProfile, Release, get_profile, + resolve_profile, ) @@ -18,7 +19,7 @@ def _release(**overrides) -> Release: def _profile(**overrides) -> DatasetProfile: base = dict( - name="test", + dataset="test", model_year=2024, cps_asec=Release(release=2025, income_year=2024), puf=Release(release=2015, income_year=2015, age_to=2024, factors="soi"), @@ -59,16 +60,12 @@ def test_release_cannot_age_backward(): # --- DatasetProfile coherence --------------------------------------------- -def test_mp_2024_is_coherent_with_only_the_acs_gap_declared(): +def test_mp_2024_is_coherent_and_has_no_gaps(): assert MP_2024.model_year == 2024 - # No *undeclared* incoherence. The ACS donor is an explicitly declared gap - # (pinned to its 2022 release, not yet aged to the model year). assert MP_2024.incoherent_sources() == {} - assert set(MP_2024.declared_gaps()) == {"acs"} - # Every non-gapped source's dollars land on the model year. + assert MP_2024.declared_gaps() == {} + # Every source's dollars land on the model year. 
for name, release in MP_2024.sources().items(): - if name in MP_2024.declared_gaps(): - continue assert release.effective_year == 2024, name @@ -78,13 +75,14 @@ def test_release_rejects_empty_gap_reason(): def test_mp_2024_donor_releases_match_source_manifest(): - # The profile's donor release years must equal what the build's donor loaders - # actually use (the pe_source_impute manifest ``default_year``), or the profile - # asserts a vintage the build does not load. This guard catches exactly the ACS - # 2024-vs-ACS_2022 mismatch the profile is meant to surface. + # SIPP/SCF donor release years must equal what the build's donor loaders use + # (the pe_source_impute manifest ``default_year``), or the profile asserts a + # vintage the build does not load. ACS is intentionally excluded: MP loads a + # newer local acs_2024.h5 via the #184 donor H5 fallback, beyond the module's + # ACS_2022 baseline default_year, so the manifest is not authoritative for ACS. from microplex_us.pe_source_impute_specs import get_pe_source_impute_block_spec - for source, block in [("acs", "acs"), ("sipp", "sipp_tips"), ("scf", "scf")]: + for source, block in [("sipp", "sipp_tips"), ("scf", "scf")]: spec = get_pe_source_impute_block_spec(block) assert MP_2024.sources()[source].release == spec.default_year, source @@ -119,6 +117,35 @@ def test_aged_source_reaching_wrong_year_is_incoherent(): def test_get_profile_returns_known_and_raises_unknown(): - assert get_profile("mp_2024") is MP_2024 + assert get_profile("mp_ecps_2024") is MP_2024 with pytest.raises(KeyError, match="Unknown dataset profile"): - get_profile("mp_1999") + get_profile("mp_ecps_1999") + + +def test_profile_is_keyed_on_dataset_and_year(): + assert MP_2024.dataset == "mp_ecps" + assert MP_2024.model_year == 2024 + assert MP_2024.name == "mp_ecps_2024" + assert MP_2024.key == ("mp_ecps", 2024) + assert resolve_profile("mp_ecps", 2024) is MP_2024 + + +def test_source_years_derive_from_the_profile(): + # The years a build 
threads through its providers come from one place. + assert MP_2024.source_years() == { + "cps_source_year": 2025, + "puf_target_year": 2024, + "acs_year": 2024, + "sipp_year": 2023, + "scf_year": 2022, + } + + +def test_version_id_is_derived_so_name_years_cannot_drift(): + vid = MP_2024.version_id( + variant="puf-support-clone", commit="abc1234", build_date="20260602" + ) + # ASEC + calendar years are pulled from the profile, not typed into the name. + assert vid == ( + "mp-ecps-shaped-asec2025-calendar2024-puf-support-clone-abc1234-20260602" + ) From f628ab8ecce225332e81967610febae02fefe91f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 2 Jun 2026 21:40:23 -0400 Subject: [PATCH 4/5] Thread the profile through provider/checkpoint/CLI; retire per-year flags A build is now keyed on a dataset profile -- the single (dataset, year) key -- instead of five loose year arguments smeared across the call stack: - default_policyengine_us_data_rebuild_source_providers and run_policyengine_us_data_rebuild_checkpoint take `profile` (default MP_2024). The per-source *_year arguments become None-defaulting overrides that resolve from profile.source_years(), so there are no literal year defaults left to drift from the profile. - The CLI takes `--profile mp_ecps_2024`; the per-source --cps-source-year / --puf-target-year / --acs-year / --sipp-year / --scf-year flags, along with --puf-cps-reference-year, are removed and the checkpoint resolves the profile via get_profile. - The regression guard now verifies the year params default to None and that the resolved providers carry the profile's years. The source years (and, via version_id(), the build name) derive from that one key. Build scripts pass --profile and derive the version-id from the profile (codex follow-up; those scripts are not tracked in this PR). 132 tests pass across vintages/rebuild/checkpoint/us/cps/donor; ruff clean.
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../pipelines/pe_us_data_rebuild.py | 32 ++++++++++--- .../pe_us_data_rebuild_checkpoint.py | 32 ++++++------- tests/pipelines/test_pe_us_data_rebuild.py | 47 ++++++++++--------- 3 files changed, 65 insertions(+), 46 deletions(-) diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild.py b/src/microplex_us/pipelines/pe_us_data_rebuild.py index d89645e..7f9b036 100644 --- a/src/microplex_us/pipelines/pe_us_data_rebuild.py +++ b/src/microplex_us/pipelines/pe_us_data_rebuild.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from microplex_us.vintages import MP_2024 +from microplex_us.vintages import MP_2024, DatasetProfile if TYPE_CHECKING: from microplex.core import SourceProvider @@ -97,10 +97,11 @@ def default_policyengine_us_data_rebuild_config( def default_policyengine_us_data_rebuild_source_providers( *, - cps_source_year: int = MP_2024.cps_asec.release, + profile: DatasetProfile = MP_2024, + cps_source_year: int | None = None, cps_cache_dir: str | Path | None = None, cps_download: bool = True, - puf_target_year: int = MP_2024.model_year, + puf_target_year: int | None = None, puf_cps_reference_year: int | None = None, puf_cache_dir: str | Path | None = None, puf_path: str | Path | None = None, @@ -109,14 +110,31 @@ def default_policyengine_us_data_rebuild_source_providers( include_donor_surveys: bool = True, include_sipp: bool | None = None, include_scf: bool | None = None, - acs_year: int = MP_2024.acs.release, - sipp_year: int = MP_2024.sipp.release, - scf_year: int = MP_2024.scf.release, + acs_year: int | None = None, + sipp_year: int | None = None, + scf_year: int | None = None, donor_cache_dir: str | Path | None = None, policyengine_us_data_repo: str | Path | None = None, policyengine_us_data_python: str | Path | None = None, ) -> tuple[SourceProvider, ...]: - """Return the canonical CPS+PUF provider bundle for the rebuild track.""" + """Return the canonical CPS+PUF 
provider bundle for one dataset ``profile``. + + Source years derive from ``profile`` -- a single ``(dataset, model_year)`` + key. The per-source ``*_year`` arguments remain only as explicit overrides + (``None`` means "take it from the profile"), so there are no stale literal + defaults to drift from the profile. + """ + + _years = profile.source_years() + cps_source_year = ( + _years["cps_source_year"] if cps_source_year is None else cps_source_year + ) + puf_target_year = ( + _years["puf_target_year"] if puf_target_year is None else puf_target_year + ) + acs_year = _years["acs_year"] if acs_year is None else acs_year + sipp_year = _years["sipp_year"] if sipp_year is None else sipp_year + scf_year = _years["scf_year"] if scf_year is None else scf_year from microplex_us.data_sources.cps import CPSASECSourceProvider from microplex_us.data_sources.donor_surveys import ( diff --git a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py index 0fa0d89..b910a7c 100644 --- a/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py +++ b/src/microplex_us/pipelines/pe_us_data_rebuild_checkpoint.py @@ -64,7 +64,7 @@ write_us_stage_run_manifests_from_artifact_manifest, ) from microplex_us.variables import prune_redundant_variables -from microplex_us.vintages import MP_2024 +from microplex_us.vintages import MP_2024, DatasetProfile, get_profile if TYPE_CHECKING: from microplex.core import SourceProvider @@ -1970,7 +1970,8 @@ def run_policyengine_us_data_rebuild_checkpoint( config_overrides: dict[str, Any] | None = None, providers: tuple[SourceProvider, ...] 
| list[SourceProvider] | None = None, queries: dict[str, SourceQuery] | None = None, - cps_source_year: int = MP_2024.cps_asec.release, + profile: DatasetProfile = MP_2024, + cps_source_year: int | None = None, cps_cache_dir: str | Path | None = None, cps_download: bool = True, puf_target_year: int | None = None, @@ -1982,9 +1983,9 @@ def run_policyengine_us_data_rebuild_checkpoint( include_donor_surveys: bool = True, include_sipp: bool | None = None, include_scf: bool | None = None, - acs_year: int = MP_2024.acs.release, - sipp_year: int = MP_2024.sipp.release, - scf_year: int = MP_2024.scf.release, + acs_year: int | None = None, + sipp_year: int | None = None, + scf_year: int | None = None, donor_cache_dir: str | Path | None = None, policyengine_us_data_repo: str | Path | None = None, policyengine_us_data_python: str | Path | None = None, @@ -2052,6 +2053,7 @@ def run_policyengine_us_data_rebuild_checkpoint( if providers is None: resolved_providers = tuple( default_policyengine_us_data_rebuild_source_providers( + profile=profile, cps_source_year=cps_source_year, cps_cache_dir=cps_cache_dir, cps_download=cps_download, @@ -2263,13 +2265,14 @@ def main(argv: list[str] | None = None) -> None: ), ) parser.add_argument( - "--cps-source-year", type=int, default=MP_2024.cps_asec.release + "--profile", + default=MP_2024.name, + help=( + "Dataset vintage profile name (e.g. mp_ecps_2024). Source release " + "years come from this single (dataset, year) key; the per-source " + "--*-year flags were removed in favor of it." 
+ ), ) - parser.add_argument("--puf-target-year", type=int) - parser.add_argument("--puf-cps-reference-year", type=int) - parser.add_argument("--acs-year", type=int, default=MP_2024.acs.release) - parser.add_argument("--sipp-year", type=int, default=MP_2024.sipp.release) - parser.add_argument("--scf-year", type=int, default=MP_2024.scf.release) parser.add_argument("--cps-cache-dir") parser.add_argument("--puf-cache-dir") parser.add_argument("--donor-cache-dir") @@ -2470,11 +2473,9 @@ def main(argv: list[str] | None = None) -> None: calibration_target_domains=tuple(args.calibration_target_domain), calibration_target_geo_levels=tuple(args.calibration_target_geo_level), config_overrides=config_overrides, - cps_source_year=args.cps_source_year, + profile=get_profile(args.profile), cps_cache_dir=args.cps_cache_dir, cps_download=not args.no_cps_download, - puf_target_year=args.puf_target_year, - puf_cps_reference_year=args.puf_cps_reference_year, puf_cache_dir=args.puf_cache_dir, puf_path=args.puf_path, puf_demographics_path=args.puf_demographics_path, @@ -2482,9 +2483,6 @@ def main(argv: list[str] | None = None) -> None: include_donor_surveys=args.include_donor_surveys, include_sipp=args.include_sipp, include_scf=args.include_scf, - acs_year=args.acs_year, - sipp_year=args.sipp_year, - scf_year=args.scf_year, donor_cache_dir=args.donor_cache_dir, policyengine_us_data_repo=args.policyengine_us_data_repo, policyengine_us_data_python=args.policyengine_us_data_python, diff --git a/tests/pipelines/test_pe_us_data_rebuild.py b/tests/pipelines/test_pe_us_data_rebuild.py index 8bc91e1..71e8940 100644 --- a/tests/pipelines/test_pe_us_data_rebuild.py +++ b/tests/pipelines/test_pe_us_data_rebuild.py @@ -207,34 +207,37 @@ def test_build_policyengine_us_data_rebuild_pipeline_returns_configured_pipeline assert pipeline.config.calibration_backend == "entropy" -def test_source_provider_year_defaults_derive_from_mp_2024_profile() -> None: - # Year defaults come from the 
single-source-of-truth vintage profile, so a - # stale literal cannot silently return. The CPS spine default is the profile's - # ASEC release (2025 = income year 2024), not the 2023 that used to require a - # CLI override to be correct. +def test_source_provider_years_resolve_from_profile_not_literals() -> None: + # The year params no longer carry literal defaults: they default to None and + # resolve from the dataset profile, so a stale literal cannot silently return. + # Both the provider and the checkpoint take a `profile` and None-default years. import inspect - from microplex_us.vintages import MP_2024 - - params = inspect.signature( - default_policyengine_us_data_rebuild_source_providers - ).parameters - assert params["cps_source_year"].default == MP_2024.cps_asec.release == 2025 - assert params["acs_year"].default == MP_2024.acs.release - assert params["sipp_year"].default == MP_2024.sipp.release - assert params["scf_year"].default == MP_2024.scf.release - assert params["puf_target_year"].default == MP_2024.model_year - - # The checkpoint signature is the second of the three sites the stale literal - # used to live in; guard it too so a revert there cannot pass silently. 
from microplex_us.pipelines.pe_us_data_rebuild_checkpoint import ( run_policyengine_us_data_rebuild_checkpoint, ) + from microplex_us.vintages import MP_2024 + provider_params = inspect.signature( + default_policyengine_us_data_rebuild_source_providers + ).parameters checkpoint_params = inspect.signature( run_policyengine_us_data_rebuild_checkpoint ).parameters - assert checkpoint_params["cps_source_year"].default == MP_2024.cps_asec.release - assert checkpoint_params["acs_year"].default == MP_2024.acs.release - assert checkpoint_params["sipp_year"].default == MP_2024.sipp.release - assert checkpoint_params["scf_year"].default == MP_2024.scf.release + for params in (provider_params, checkpoint_params): + assert "profile" in params + for year in ( + "cps_source_year", + "puf_target_year", + "acs_year", + "sipp_year", + "scf_year", + ): + assert params[year].default is None, year + + # With the default profile, the resolved providers carry MP_2024's years. + providers = default_policyengine_us_data_rebuild_source_providers(cps_download=False) + years = MP_2024.source_years() + assert providers[0].year == years["cps_source_year"] == 2025 + assert providers[1].target_year == years["puf_target_year"] == 2024 + assert providers[2].year == years["acs_year"] == 2024 From 1ad1f3f569ffcbb4cdc7ccde4f9005f2930ec3ad Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 2 Jun 2026 22:11:04 -0400 Subject: [PATCH 5/5] Add CLI --profile coverage from cycle review Cover the new CLI surface the review flagged as untested: assert that --profile resolves through the vintage registry and threads the resolved profile onto the checkpoint call, and that an unknown profile name fails loudly (KeyError) rather than silently building the wrong dataset. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../test_pe_us_data_rebuild_checkpoint.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py b/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py index a78cee0..2f17cda 100644 --- a/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py +++ b/tests/pipelines/test_pe_us_data_rebuild_checkpoint.py @@ -706,6 +706,8 @@ def fake_run_policyengine_us_data_rebuild_checkpoint(**kwargs): "/tmp/policy_data.db", "--version-id", "run-1", + "--profile", + "mp_ecps_2024", "--donor-imputer-condition-selection", "pe_plus_puf_native_challenger", "--defer-native-audit", @@ -720,11 +722,37 @@ def fake_run_policyengine_us_data_rebuild_checkpoint(**kwargs): assert captured["config_overrides"]["random_seed"] == 42 assert captured["defer_native_audit"] is True assert captured["defer_imputation_ablation"] is True + # The CLI resolves --profile through the vintage registry and threads the + # resolved profile to the checkpoint (the per-year flags no longer exist). + assert captured["profile"].name == "mp_ecps_2024" stdout = capsys.readouterr().out assert "/tmp/artifacts/run-1" in stdout assert "hasRealPolicyEngineComparison" in stdout +def test_main_rejects_unknown_profile() -> None: + # --profile is resolved through the vintage registry; an unknown name fails + # loudly rather than silently building the wrong dataset. get_profile raises + # before the checkpoint runs, so no checkpoint mock is needed. + try: + checkpoint_module.main( + [ + "--output-root", + "/tmp/artifacts", + "--baseline-dataset", + "/tmp/enhanced_cps_2024.h5", + "--targets-db", + "/tmp/policy_data.db", + "--profile", + "mp_ecps_1999", + ] + ) + except KeyError as exc: + assert "Unknown dataset profile" in str(exc) + else: + raise AssertionError("Expected unknown --profile to fail closed") + + def test_run_policyengine_us_data_rebuild_checkpoint_rejects_empty_provider_sequence( tmp_path, ) -> None: