From 8e63dbdae217c4990c7b7e6e17bcf04afee757fe Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Tue, 30 Jun 2026 21:59:50 -0400 Subject: [PATCH] ci(fleet): guard example-suite tooling pins against floor drift Each cascade-example repo bootstraps a cascade CLI in its scenario suite, pinned by hand to a fixed release. Nothing kept that pin moving forward, so a suite could sit on a release that predates a command it now invokes and fail a live fleet lane with a cryptic unknown-command error mid-fan-out. Add a shared check that fails when a suite's setup-cli pin is below the latest stable cascade release (the feature floor). Run it two ways: a daily Suite Tooling Floor workflow that surfaces drift off the release cadence, and a floor-check gate in fleet-e2e that reds an rc run before any repin or dispatch. A pin at or above the floor passes; a suite tracking a moving ref is not flagged. Signed-off-by: Joshua Temple --- .github/scripts/check-suite-tooling-floor.sh | 118 +++++++++++++++++++ .github/workflows/fleet-e2e.yaml | 42 ++++++- .github/workflows/suite-tooling-floor.yaml | 50 ++++++++ 3 files changed, 205 insertions(+), 5 deletions(-) create mode 100755 .github/scripts/check-suite-tooling-floor.sh create mode 100644 .github/workflows/suite-tooling-floor.yaml diff --git a/.github/scripts/check-suite-tooling-floor.sh b/.github/scripts/check-suite-tooling-floor.sh new file mode 100755 index 0000000..6c885fc --- /dev/null +++ b/.github/scripts/check-suite-tooling-floor.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# check-suite-tooling-floor.sh - fail when an example repo's scenario suite +# pins the cascade tooling below the feature floor. +# +# Each cascade-example repo runs its own scenario-suite.yaml, which bootstraps +# a cascade CLI through the setup-cli action to drive its scenarios. That +# bootstrap version is pinned by hand in the suite. 
Nothing keeps it moving +# forward, so a suite can sit on a release that predates a command the suite +# now invokes, producing an "unknown command" failure deep inside a live fleet +# run. This check compares every suite's pin against the floor and fails fast +# when one has drifted below it, so the drift is caught before a fleet run. +# +# The floor is the latest published stable cascade release: the newest version +# every released command is guaranteed to exist in. A suite pinned at or above +# the floor passes; only a strictly-lower pin fails. A suite that tracks a +# moving ref (for example @main) carries no semver pin and is treated as +# current, so it is never flagged. +# +# Usage: +# check-suite-tooling-floor.sh # floor = latest release +# FLOOR=v0.7.0 check-suite-tooling-floor.sh # explicit floor override +# REPOS="4env 3env" check-suite-tooling-floor.sh # subset of the roster +# +# Environment: +# FLOOR Override the floor version (vX.Y.Z). Empty resolves to the +# latest stable release of the cascade repo. +# REPOS Space-separated example-repo short names to check. Empty uses +# the full roster below. +# FLEET_OWNER GitHub owner of the cascade and example repos (default +# stablekernel). +# +# Requires: gh (authenticated), base64, sort -V. +set -euo pipefail + +OWNER="${FLEET_OWNER:-stablekernel}" + +# Canonical floor-check roster. This mirrors the fleet-e2e repin roster: every +# example repo whose suite installs a pinned cascade CLI belongs here. Keep it +# in sync with the roster in .github/workflows/fleet-e2e.yaml when a repo is +# added or removed. +DEFAULT_REPOS="primary artifact-a artifact-b 4env 3env 2env single-env release-only no-env callbacks rollback-dispatch" + +SUITE_PATH=".github/workflows/scenario-suite.yaml" + +# ver_lt A B: succeed when semver A is strictly lower than semver B. 
+ver_lt() {
+  # Equal versions are never "lower". Otherwise A is lower exactly when it
+  # sorts first under version ordering (sort -V), which compares the numeric
+  # fields of vX.Y.Z as numbers rather than as text (v0.9.0 < v0.10.0).
+  [[ "$1" != "$2" ]] \
+    && [[ "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n 1)" == "$1" ]]
+}
+
+# resolve_floor: echo the newest non-prerelease, non-draft release tag.
+# At most 50 releases are fetched (-L 50) and only tags of the exact form
+# vX.Y.Z are considered. Prints nothing when no tag qualifies or the lookup
+# fails; the caller validates the result.
+resolve_floor() {
+  gh release list --repo "${OWNER}/cascade" -L 50 \
+    --json tagName,isPrerelease,isDraft \
+    --jq '.[] | select(.isPrerelease == false and .isDraft == false) | .tagName' \
+    | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | sort -V | tail -n 1
+}
+
+floor="${FLOOR:-}"
+if [[ -z "$floor" ]]; then
+  # A failed lookup leaves floor empty; the format check below rejects it.
+  floor="$(resolve_floor || true)"
+fi
+# Match the whole value, so a multi-line FLOOR override cannot pass on the
+# strength of a single well-formed line.
+if [[ ! "$floor" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+  echo "::error::could not resolve a stable floor version (got '${floor:-}')"
+  exit 1
+fi
+echo "Feature floor (latest stable cascade release): ${floor}"
+echo ""
+
+read -ra repos <<< "${REPOS:-$DEFAULT_REPOS}"
+
+# A pin is the setup-cli action ref or a version input. The input value may be
+# bare or wrapped in single or double quotes.
+semver='v[0-9]+\.[0-9]+\.[0-9]+'
+pin_pattern="setup-cli@${semver}|version:[[:space:]]*[\"']?${semver}"
+
+# gh's stderr is kept in a scratch file so that an HTTP 404 (no suite file)
+# can be told apart from auth, rate-limit, and network failures. Note that
+# this makes mktemp a runtime requirement of the script.
+gh_err="$(mktemp)"
+trap 'rm -f -- "$gh_err"' EXIT
+
+stale=""
+skipped=""
+unreadable=""
+checked=0
+for name in "${repos[@]}"; do
+  slug="${OWNER}/cascade-example-${name}"
+
+  if ! encoded="$(gh api "repos/${slug}/contents/${SUITE_PATH}" \
+    --jq '.content' 2>"$gh_err")"; then
+    if grep -q 'HTTP 404' "$gh_err"; then
+      # The suite file does not exist in this repo: nothing to compare.
+      skipped="${skipped} ${name}(no-suite)"
+    else
+      # Anything else means the suite could not be inspected. Treating that
+      # as a skip would let the check pass without having checked anything.
+      echo " ${name}: could not read ${SUITE_PATH}: $(head -n 1 "$gh_err")"
+      unreadable="${unreadable} ${name}"
+    fi
+    continue
+  fi
+
+  if ! content="$(printf '%s' "$encoded" | base64 -d 2>/dev/null)"; then
+    echo " ${name}: ${SUITE_PATH} content is not valid base64"
+    unreadable="${unreadable} ${name}"
+    continue
+  fi
+  if [[ -z "$content" ]]; then
+    skipped="${skipped} ${name}(no-suite)"
+    continue
+  fi
+  checked=$((checked + 1))
+
+  # Extract every semver pin from the suite's setup-cli action ref and its
+  # version input. A moving ref (for example setup-cli@main) yields no match
+  # and is skipped rather than flagged.
+  pins="$(printf '%s' "$content" \
+    | grep -oE "$pin_pattern" \
+    | grep -oE "$semver" | sort -u || true)"
+  if [[ -z "$pins" ]]; then
+    echo " ${name}: no semver tooling pin (tracks a moving ref); skipped"
+    skipped="${skipped} ${name}(moving-ref)"
+    continue
+  fi
+
+  while IFS= read -r pin; do
+    [[ -n "$pin" ]] || continue
+    if ver_lt "$pin" "$floor"; then
+      echo " ${name}: ${pin} is BELOW floor ${floor}"
+      stale="${stale} ${name}:${pin}"
+    else
+      echo " ${name}: ${pin} >= floor ${floor}"
+    fi
+  done <<< "$pins"
+done
+
+echo ""
+if [[ -n "$skipped" ]]; then
+  echo "Not pin-checked:${skipped}"
+fi
+
+failed=0
+if [[ -n "$unreadable" ]]; then
+  echo "::error::could not read the scenario suite of:${unreadable}"
+  echo "Confirm GH_TOKEN can read each listed repo, then rerun this check."
+  failed=1
+elif ((checked == 0)); then
+  # Every repo came back as missing. GitHub may answer 404 for a repo the
+  # token cannot see, so a run that inspected nothing must not report success.
+  echo "::error::no example-repo scenario suite could be read; nothing was checked"
+  failed=1
+fi
+
+if [[ -n "$stale" ]]; then
+  echo "::error::example-suite tooling pins below floor ${floor}:${stale}"
+  echo "Bump each listed repo's ${SUITE_PATH} setup-cli pin (both the action"
+  echo "ref and the version input) to at least ${floor}, then rerun this check."
+  failed=1
+fi
+
+if ((failed)); then
+  exit 1
+fi
+
+echo "All example-repo suite tooling pins are at or above floor ${floor}."
diff --git a/.github/workflows/fleet-e2e.yaml b/.github/workflows/fleet-e2e.yaml index 0b08eea..ceacd13 100644 --- a/.github/workflows/fleet-e2e.yaml +++ b/.github/workflows/fleet-e2e.yaml @@ -287,6 +287,29 @@ jobs: echo "| remainder | $RUN_REMAINDER |" } >> "$GITHUB_STEP_SUMMARY" + # Floor check: the repin below points each repo's manifest cli_version at the + # rc under test, but it deliberately leaves each suite's OWN setup-cli + # bootstrap pin alone (it only rewrites prerelease refs). A suite pinned to a + # stable release that predates a command the suite now invokes fails a live + # lane with a cryptic "unknown command" mid-fan-out. This job runs the shared + # floor check before any repin or fan-out so that drift reds the run up front + # with a clear message instead. The suite pin must be at or above the latest + # stable release; a pin at or above the floor (including one ahead of it) + # passes. 
The daily Suite Tooling Floor workflow runs the same check off the + # release cadence so drift is usually caught before an rc run reaches here. + floor-check: + name: Check suite tooling pins + needs: resolve + runs-on: ubuntu-latest + permissions: + contents: read + env: + GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + - name: Check example-suite tooling pins against the floor + run: ./.github/scripts/check-suite-tooling-floor.sh + # Repin: pin every example repo to the rc UNDER TEST before any suite fans # out. Without this the suites would install whatever version each repo's # manifest is statically pinned to, so a fresh rc would never actually run - @@ -296,10 +319,11 @@ jobs: # job gates on this job so none can start against a stale pin. Repin always # covers the full roster regardless of the repos selector: pinning is cheap, # idempotent, and sequential (one repo at a time), so it does not add to live - # fan-out concurrency. + # fan-out concurrency. Gated on floor-check so a below-floor suite pin reds the + # run before any live dispatch. repin: name: Repin fleet to rc - needs: resolve + needs: [resolve, floor-check] runs-on: ubuntu-latest permissions: contents: read @@ -624,7 +648,7 @@ jobs: # over exactly the lanes that ran. A real fan-out failure still reds the run. aggregate: name: Fleet gate - needs: [resolve, plan, repin, primary, dependents, heavy, remainder] + needs: [resolve, plan, floor-check, repin, primary, dependents, heavy, remainder] # Only render a verdict when the fleet actually fanned out. 
On filtered-out # completions (merge_group, non-rc tags, dispatch with no rc) resolve is # skipped, so this job is skipped too and the run is a clean no-op rather @@ -637,6 +661,7 @@ jobs: steps: - name: Aggregate fleet result env: + R_FLOOR: ${{ needs.floor-check.result }} R_REPIN: ${{ needs.repin.result }} R_PRIMARY: ${{ needs.primary.result }} R_DEPENDENTS: ${{ needs.dependents.result }} @@ -652,6 +677,7 @@ jobs: echo "" echo "| Lane | Result |" echo "|---|---|" + echo "| floor-check (suite tooling pins) | $R_FLOOR |" echo "| repin (all 10 repos to rc) | $R_REPIN |" echo "| primary | $R_PRIMARY |" echo "| dependents (artifact-a, artifact-b) | $R_DEPENDENTS |" @@ -670,13 +696,19 @@ jobs: # A lane passes when it succeeded OR was skipped (filtered out by the # repos selector, or - for dependents - skipped because primary was # not selected). Only an actual failure or cancellation reds the gate. - # repin is never selector-gated, so a non-success repin always reds. + # floor-check and repin are never selector-gated, so a non-success + # result from either always reds. A failed floor-check also skips + # repin, so it must be checked directly here or the skipped repin would + # read as a pass. fail=0 - for r in "$R_REPIN" "$R_PRIMARY" "$R_DEPENDENTS" "$R_HEAVY" "$R_REMAINDER"; do + for r in "$R_FLOOR" "$R_REPIN" "$R_PRIMARY" "$R_DEPENDENTS" "$R_HEAVY" "$R_REMAINDER"; do if [ "$r" != "success" ] && [ "$r" != "skipped" ]; then fail=1 fi done + if [ "$R_FLOOR" != "success" ]; then + fail=1 + fi if [ "$fail" -ne 0 ]; then echo "::error::Fleet E2E failed: one or more lanes did not pass" exit 1 diff --git a/.github/workflows/suite-tooling-floor.yaml b/.github/workflows/suite-tooling-floor.yaml new file mode 100644 index 0000000..22e3b2c --- /dev/null +++ b/.github/workflows/suite-tooling-floor.yaml @@ -0,0 +1,50 @@ +# Suite Tooling Floor - guards the cascade-example fleet against tooling drift. 
+# +# Every example repo's scenario-suite.yaml bootstraps a cascade CLI through the +# setup-cli action, pinned by hand to a fixed release. Nothing keeps that pin +# moving forward, so a suite can drift onto a release that predates a command it +# now invokes and fail a live fleet run with a cryptic "unknown command". This +# job runs the floor check daily so the drift surfaces on its own schedule, well +# before an rc fleet run trips over it. The same check also gates fleet-e2e.yaml +# before fan-out, so a stale pin is caught at release time too. +# +# The floor is the latest published stable cascade release. A suite pinned at or +# above the floor passes; only a strictly-lower pin fails. A suite that tracks a +# moving ref (for example @main) carries no semver pin and is never flagged. +name: Suite Tooling Floor + +on: + schedule: + # Daily, offset from other scheduled jobs to spread live API load. + - cron: '17 6 * * *' + workflow_dispatch: + inputs: + floor: + description: >- + Override the floor version (e.g. v0.7.0). Empty resolves to the latest + stable cascade release. + required: false + default: '' + +permissions: + contents: read + +concurrency: + group: suite-tooling-floor + cancel-in-progress: true + +jobs: + check: + name: Check example-suite tooling pins + runs-on: ubuntu-latest + permissions: + contents: read + env: + # Cross-repo reads of the example repos' suites use the fleet token, the + # same credential the fleet itself reads and writes those repos with. + GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }} + FLOOR: ${{ github.event.inputs.floor }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + - name: Check suite tooling pins against the floor + run: ./.github/scripts/check-suite-tooling-floor.sh