diff --git a/.github/license-check/config.json b/.github/license-check/config.json deleted file mode 100644 index 15b235e41..000000000 --- a/.github/license-check/config.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "include": [ - "**/*.go" - ], - "exclude": [ - "internal/repository/postgres/db/**", - "pkg/contracts/**", - "pkg/inspectclient/generated.go" - ], - "license": ".github/license-check/header.txt" - } -] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8132b7ead..4ce3c4d4b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: basic-checks: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 @@ -27,13 +27,10 @@ jobs: check-latest-tag-only: true - name: Check license header - uses: viperproject/check-license-header@v2 - with: - path: ./ - config: .github/license-check/config.json + run: make check-license - name: Lint Markdown docs - uses: DavidAnson/markdownlint-cli2-action@v16 + uses: DavidAnson/markdownlint-cli2-action@ded1f9488f68a970bc66ea5619e13e9b52e601cd # v23 with: globs: | *.md @@ -47,7 +44,7 @@ jobs: packages: write steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive @@ -55,7 +52,7 @@ jobs: run: echo ROLLUPS_NODE_VERSION=`make version` >> $GITHUB_ENV - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -142,7 +139,7 @@ jobs: run: make copy-debian-package BUILD_PLATFORM=linux/arm64 DEB_ARCH=arm64 DEB_PACKAGER_IMG=${{ github.repository_owner }}/rollups-node:debian-packager-arm64 - name: Upload deb artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: artifacts path: | @@ -159,10 +156,10 @@ jobs: packages: read steps: - name: Checkout source code - 
uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -185,10 +182,10 @@ jobs: packages: read steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -207,7 +204,7 @@ jobs: cartesi/rollups-node-devnet:devel - name: Cache test machine images - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: test/downloads key: test-deps-${{ hashFiles('test/dependencies.sha256') }} @@ -218,19 +215,48 @@ jobs: - name: Run unit tests run: make unit-test-with-compose + # Runs the shard coverage guard and derives the integration matrix from the + # Makefile's shard × topology cells, so adding a shard or topology never + # requires a workflow change and an unassigned test fails fast here. 
+ integration-test-setup: + runs-on: ubuntu-24.04 + outputs: + cells: ${{ steps.matrix.outputs.cells }} + steps: + - name: Checkout source code + uses: actions/checkout@v6 + + - name: Setup Go + uses: actions/setup-go@v6 + with: + go-version-file: go.mod + + - name: Check shard coverage + run: make integration-test-shard-check + + - name: List shard x topology cells + id: matrix + run: echo "cells=$(make -s list-integration-cells)" >> "$GITHUB_OUTPUT" + integration-test: runs-on: ubuntu-24.04 - needs: [build] + needs: [build, integration-test-setup] timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: ${{ fromJson(needs.integration-test-setup.outputs.cells) }} + env: + COMPOSE_PROJECT: rollups-it-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.shard }}-${{ matrix.topology }} permissions: contents: read packages: read steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -249,7 +275,7 @@ jobs: cartesi/rollups-node-devnet:devel - name: Cache test machine images - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: test/downloads key: test-deps-${{ hashFiles('test/dependencies.sha256') }} @@ -257,17 +283,30 @@ jobs: - name: Download test dependencies run: make download-test-dependencies - - name: Run integration tests - run: make integration-test-with-compose + - name: Run integration shard ${{ matrix.shard }} (${{ matrix.topology }}) + run: | + make integration-test-with-compose SHARD=${{ matrix.shard }} \ + NODE_TOPOLOGY=${{ matrix.topology }} \ + COMPOSE_PROJECT="$COMPOSE_PROJECT" - name: Upload integration test logs if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: - name: integration-test-logs - path: integration-logs.txt + name: 
integration-test-logs-${{ matrix.shard }}-${{ matrix.topology }} + path: integration-logs-${{ matrix.shard }}-${{ matrix.topology }}.txt retention-days: 3 + # Redundant with the trap in compose-integration-run.sh; this is the + # safety net for when concurrency cancel-in-progress SIGKILLs the make + # process before its trap can fire, which would otherwise leak the + # project's containers and volumes. + - name: Clean up compose project + if: always() + run: | + docker compose -p "$COMPOSE_PROJECT" \ + -f test/compose/compose.integration.yaml down -v --remove-orphans || true + publish_artifacts: name: Publish artifacts needs: [basic-checks, build, unit-test, integration-test] @@ -277,13 +316,13 @@ jobs: contents: write steps: - name: Checkout emulator source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 - name: Upload products to GitHub Release - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@718ea10b132b3b2eba29c1007bb80653f286566b # v3 with: draft: true files: | diff --git a/.github/workflows/clean-up-images.yml b/.github/workflows/clean-up-images.yml index 2ca11d67b..35399eec3 100644 --- a/.github/workflows/clean-up-images.yml +++ b/.github/workflows/clean-up-images.yml @@ -1,11 +1,16 @@ # yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json name: Clean-up Docker images +# build.yml pushes four ci- tagged images to GHCR on every push; this +# periodic sweep is what keeps them from accumulating without bound. on: - pull_request: - branches: - - main - types: [closed] + schedule: + # Approximately every 5 days. cron has no true "every N days", so this + # fires on days-of-month 1,6,11,16,21,26,31 and resets at each month + # start (one short gap at the boundary) — fine for a cleanup with a + # 7-day age floor. 
+ - cron: "0 3 */5 * *" + workflow_dispatch: jobs: cleanup: @@ -19,22 +24,14 @@ jobs: - rollups-node - rollups-node-devnet steps: - # Remove PR-scoped tags immediately. - - name: Prune PR tags - uses: vlaurin/action-ghcr-prune@v0.6.0 - with: - organization: cartesi - container: ${{ matrix.image }} - token: ${{ secrets.GITHUB_TOKEN }} - prune-untagged: false - keep-last: 0 - prune-tags-regexes: | - ^pr-${{ github.event.number }}$ - # Prune stale CI images older than 7 days to avoid deleting # images needed by concurrently running workflows. - name: Prune stale CI tags - uses: vlaurin/action-ghcr-prune@v0.6.0 + # Pinned to a commit: third-party action holding a packages:write + # token. Still node20: no node24 release exists upstream (checked + # 2026-06); works until the runner removes node20 (announced + # 2026-09-16). + uses: vlaurin/action-ghcr-prune@0cf7d39f88546edd31965acba78cdcb0be14d641 # v0.6.0 with: organization: cartesi container: ${{ matrix.image }} diff --git a/.gitignore b/.gitignore index fa3b0e0ae..ac2440d0c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ machine-snapshot/** /applications /test/downloads /snapshots +/integration-logs*.txt diff --git a/Dockerfile b/Dockerfile index 6445e3f59..cfb172621 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,8 +20,10 @@ ARG GO_BUILD_PATH RUN </dev/null || true ; \ + for s in $(INTEGRATION_SHARDS) ; do \ + docker compose -p rollups-node-integration-$$s-$$t -f test/compose/compose.integration.yaml down -v --remove-orphans 2>/dev/null || true ; \ + done ; \ + done + @docker compose -p rollups-node-integration -f test/compose/compose.integration.yaml down -v --remove-orphans 2>/dev/null || true + # ============================================================================= # Tests # ============================================================================= @@ -497,26 +519,171 @@ unit-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run unit tests using d lint-with-docker: ## Run linting inside 
Docker (no host Go needed) @docker run --rm cartesi/rollups-node:tester sh -c 'make lint && make vet && make fmt-check' -integration-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run integration tests using docker compose with auto-shutdown - @trap 'docker compose -f test/compose/compose.integration.yaml logs --no-color > integration-logs.txt 2>&1 || true; docker compose -f test/compose/compose.integration.yaml down -v || true' EXIT && \ - docker compose -f test/compose/compose.integration.yaml run --rm --remove-orphans integration-test +check-license: ## Verify license headers on Go source files + @scripts/check-license-header.sh + +# ============================================================================= +# Integration test sharding +# ============================================================================= +# Each shard runs as an isolated Docker Compose project (own Anvil, Postgres, +# and test-managed node); tests within a shard stay sequential. The anchors +# (^...$$) are load-bearing: go test -run matches unanchored, and several test +# names are prefixes of others (Foreclose / ForecloseReplay / ForeclosePrt). +# Every top-level test must match exactly one shard; this is enforced by +# `make integration-test-shard-check`. +# +# Shards are grouped by semantic family, not balanced by runtime: `withdrawal` +# is a single test while `restart` and `replay` are the heaviest. Each shard +# gets its own CI runner and the full per-job `go test -timeout 55m` +# (run-integration-tests.sh) budget; `restart` (multi-suite, ~25-min setup +# contexts) is the first to watch if a shard ever approaches that ceiling. +# Discovery (integration-test-shard-check) lists tests with a plain Go +# toolchain, so the integration package must stay free of the Cartesi CGo +# dependency for the check to build on the CI setup runner. 
+INTEGRATION_SHARDS := basic quorum prt replay restart withdrawal + +INTEGRATION_SHARD_basic := ^Test(EchoAuthority|RejectException|MultiApp|EchoAuthorityStaging)$$ +INTEGRATION_SHARD_quorum := ^Test(EchoQuorum|SameBlockInputs)$$ +INTEGRATION_SHARD_prt := ^Test(EchoPrt|RejectExceptionPrt|ForeclosePrt)$$ +INTEGRATION_SHARD_replay := ^Test(Foreclose|ForecloseReplay|DivergentClaim)$$ +INTEGRATION_SHARD_restart := ^Test(Restart|SnapshotPolicy)$$ +INTEGRATION_SHARD_withdrawal := ^TestWithdrawalLifecycle$$ + +# ----------------------------------------------------------------------------- +# Node topology axis — orthogonal to shards. +# ----------------------------------------------------------------------------- +# A shard selects WHICH tests run; a topology selects HOW the node is deployed. +# CI runs the (shard, topology) cells in parallel. Both topologies are +# test-managed (TestMain starts and can restart them), so both run every shard. +# standalone — the all-in-one cartesi-rollups-node process. +# multiprocess — one OS process per service (evm-reader, advancer, validator, +# claimer, prt, jsonrpc-api) sharing Postgres, started as +# subprocesses by TestMain (on the host, or inside the test +# container under compose). See test/integration/multinode_helpers_test.go. +# +# Applicability is per-topology data (the INTEGRATION_SHARDS_* variables). multiprocess +# runs the SAME shards as standalone — the node-lifecycle tests stop/start the +# whole service set via the topology-aware harness. Killing a single service +# (partial failure) is separate future fault-injection work. +INTEGRATION_TOPOLOGIES := standalone multiprocess +NODE_TOPOLOGY ?= standalone + +INTEGRATION_SHARDS_standalone := $(INTEGRATION_SHARDS) +INTEGRATION_SHARDS_multiprocess := $(INTEGRATION_SHARDS) + +# The CI matrix is the set of (shard, topology) cells, encoded "shard:topology". 
+INTEGRATION_CELLS := $(foreach t,$(INTEGRATION_TOPOLOGIES),$(foreach s,$(INTEGRATION_SHARDS_$(t)),$(s):$(t))) + +COMPOSE_PROJECT ?= rollups-node-integration +INTEGRATION_LOGS ?= integration-logs.txt +INTEGRATION_TEST_JOBS ?= 3 + +# String helpers for building run patterns / matrices. +comma := , +empty := +space := $(empty) $(empty) + +# --- Selection driven by NODE_TOPOLOGY and SHARD ---------------------------- +# Selected topologies: NODE_TOPOLOGY (default standalone), a space-separated +# list, or the sugar value `all`. +TOPOLOGIES_SELECTED = $(if $(filter all,$(NODE_TOPOLOGY)),$(INTEGRATION_TOPOLOGIES),$(NODE_TOPOLOGY)) +# shards_for(topology): the SHARD filter (or all) intersected with what the +# topology supports (its INTEGRATION_SHARDS_* variable). +shards_for = $(filter $(if $(strip $(SHARD)),$(SHARD),$(INTEGRATION_SHARDS_$(1))),$(INTEGRATION_SHARDS_$(1))) +# run_pattern(topology): the selected shards' -run regexes as one alternation. +run_pattern = $(subst $(space),|,$(strip $(foreach s,$(call shards_for,$(1)),$(INTEGRATION_SHARD_$(s))))) +# Selected (shard:topology) cells, for PARALLEL fan-out. +SELECTED_CELLS = $(foreach t,$(TOPOLOGIES_SELECTED),$(foreach s,$(call shards_for,$(t)),$(s):$(t))) +# Label for project/log names: the SHARD filter joined by '-', or "all". +SUITE_LABEL = $(if $(strip $(SHARD)),$(subst $(space),-,$(strip $(SHARD))),all) + +# Validate NODE_TOPOLOGY / SHARD at parse time for the two entry points so a +# bad invocation fails before any prerequisite work (downloading images, etc). +ifneq ($(filter integration-test-with-compose integration-test-local,$(MAKECMDGOALS)),) +$(foreach t,$(TOPOLOGIES_SELECTED),$(if $(filter $(t),$(INTEGRATION_TOPOLOGIES)),,$(error unknown topology '$(t)'. Known topologies: $(INTEGRATION_TOPOLOGIES)))) +$(foreach s,$(SHARD),$(if $(filter $(s),$(INTEGRATION_SHARDS)),,$(error unknown shard '$(s)'. Known shards: $(INTEGRATION_SHARDS)))) +endif + +# PARALLEL is compose-only: the host node binds fixed ports. 
Fail at parse time, +# before the (slow) build prerequisites of integration-test-local. +ifneq ($(filter integration-test-local,$(MAKECMDGOALS)),) +ifeq ($(PARALLEL),true) +$(error PARALLEL is not supported for integration-test-local (the host node binds fixed ports 10000/10011/10012). Use 'integration-test-with-compose PARALLEL=true') +endif +endif + +# ============================================================================= +# Two entry points. Both honor: +# NODE_TOPOLOGY one topology (default standalone), a list, or `all` +# SHARD restrict to a subset of shards (default: all applicable) +# PARALLEL =true runs cells concurrently (compose only) +# A requested-but-inapplicable (shard, topology) is skipped with a message. +# ============================================================================= + +integration-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run integration tests via docker compose (NODE_TOPOLOGY=, SHARD=, PARALLEL=true) +ifeq ($(PARALLEL),true) + @$(MAKE) -k -j $(INTEGRATION_TEST_JOBS) $(addprefix _compose-cell-,$(SELECTED_CELLS)) +else + @set -e; for t in $(TOPOLOGIES_SELECTED); do $(MAKE) _compose-topology-$$t; done +endif + +# One topology: the union of selected shards in one compose project, sequential. +_compose-topology-%: + @pattern='$(call run_pattern,$*)'; \ + if [ -z "$$pattern" ]; then echo "skip: no applicable shards for topology '$*' (SHARD filter excludes all)"; exit 0; fi; \ + COMPOSE_PROJECT='$(if $(filter rollups-node-integration,$(COMPOSE_PROJECT)),rollups-node-integration-$(SUITE_LABEL)-$*,$(COMPOSE_PROJECT))' \ + INTEGRATION_LOGS='integration-logs-$(SUITE_LABEL)-$*.txt' \ + TEST_PATTERN="$$pattern" SHARD_NAME='$(SUITE_LABEL)-$*' NODE_TOPOLOGY='$*' \ + GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ + scripts/compose-integration-run.sh + +# One (shard:topology) cell in its own project — PARALLEL fan-out target. 
+_compose-cell-%: + @COMPOSE_PROJECT='$(if $(filter rollups-node-integration,$(COMPOSE_PROJECT)),rollups-node-integration-$(firstword $(subst :, ,$*))-$(lastword $(subst :, ,$*)),$(COMPOSE_PROJECT))' \ + INTEGRATION_LOGS='integration-logs-$(firstword $(subst :, ,$*))-$(lastword $(subst :, ,$*)).txt' \ + TEST_PATTERN='$(INTEGRATION_SHARD_$(firstword $(subst :, ,$*)))' \ + SHARD_NAME='$(firstword $(subst :, ,$*))' \ + NODE_TOPOLOGY='$(lastword $(subst :, ,$*))' \ + GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ + scripts/compose-integration-run.sh + +integration-test-shard-check: ## Verify every integration test belongs to exactly one shard + @scripts/check-integration-shards.sh \ + $(foreach s,$(INTEGRATION_SHARDS),'$(s)=$(INTEGRATION_SHARD_$(s))') + +list-integration-shards: ## Print integration shard names as a JSON array + @echo '[$(subst $(space),$(comma),$(patsubst %,"%",$(INTEGRATION_SHARDS)))]' + +list-integration-cells: ## Print shard×topology cells as a JSON array of {shard,topology} (for the CI matrix) + @printf '['; sep=''; \ + for cell in $(INTEGRATION_CELLS); do \ + printf '%s{"shard":"%s","topology":"%s"}' "$$sep" "$${cell%%:*}" "$${cell##*:}"; \ + sep=','; \ + done; \ + printf ']\n' test-with-compose: ## Run all tests using docker compose with auto-shutdown @$(MAKE) unit-test-with-compose @$(MAKE) integration-test-with-compose -integration-test-local: build cartesi-rollups-machine-tool echo-dapp reject-loop-dapp exception-loop-dapp erc20-withdrawal-dapp ## Run integration tests locally (requires: make start && eval $$(make env)) - @cartesi-rollups-cli db init - @if lsof -ti:10000 >/dev/null 2>&1; then \ - echo "Killing stale node on port 10000..."; \ - kill $$(lsof -ti:10000) 2>/dev/null || true; \ - sleep 2; \ - fi - @export CARTESI_TEST_DAPP_PATH=$(CURDIR)/applications/echo-dapp; \ - export CARTESI_TEST_REJECT_DAPP_PATH=$(CURDIR)/applications/reject-loop-dapp; \ - export CARTESI_TEST_EXCEPTION_DAPP_PATH=$(CURDIR)/applications/exception-loop-dapp; \ - 
export CARTESI_TEST_ERC20_WITHDRAWAL_DAPP_PATH=$(CURDIR)/applications/erc20-withdrawal-dapp; \ - $(MAKE) integration-test +integration-test-local: build cartesi-rollups-machine-tool echo-dapp reject-loop-dapp exception-loop-dapp erc20-withdrawal-dapp ## Run integration tests on the host (NODE_TOPOLOGY=, SHARD=; requires: make start && eval $$(make env)) + @set -e; first=1; for t in $(TOPOLOGIES_SELECTED); do \ + if [ "$$first" = 1 ]; then first=0; else echo "=== resetting dev DB + devnet between topologies ==="; $(MAKE) restart; fi; \ + $(MAKE) _local-topology-$$t; \ + done + +# One topology on the host: in-process node (standalone) or service subprocesses +# (multiprocess, via TestMain reading NODE_TOPOLOGY). +_local-topology-%: + @pattern='$(call run_pattern,$*)'; \ + if [ -z "$$pattern" ]; then echo "skip: no applicable shards for topology '$*' (SHARD filter excludes all)"; exit 0; fi; \ + cartesi-rollups-cli db init; \ + for b in cartesi-rollups-node cartesi-rollups-evm-reader cartesi-rollups-advancer cartesi-rollups-validator cartesi-rollups-claimer cartesi-rollups-jsonrpc-api cartesi-rollups-prt; do pkill -x $$b 2>/dev/null || true; done; \ + export CARTESI_TEST_DAPP_PATH=$(CURDIR)/applications/echo-dapp; \ + export CARTESI_TEST_REJECT_DAPP_PATH=$(CURDIR)/applications/reject-loop-dapp; \ + export CARTESI_TEST_EXCEPTION_DAPP_PATH=$(CURDIR)/applications/exception-loop-dapp; \ + export CARTESI_TEST_ERC20_WITHDRAWAL_DAPP_PATH=$(CURDIR)/applications/erc20-withdrawal-dapp; \ + NODE_TOPOLOGY='$*' TEST_PATTERN="$$pattern" $(MAKE) integration-test deploy-load-test-apps: applications/echo-dapp ## Deploy 3 echo-dapp instances for load testing @echo "Deploying load-test apps (3 echo-dapps with different salts)..." 
@@ -574,11 +741,12 @@ build-debian-package: install .PHONY: \ build build-go $(GO_ARTIFACTS) cartesi-rollups-machine-tool \ - clean clean-go clean-contracts clean-docs clean-devnet-files clean-dapps clean-test-dependencies clean-debian-packages \ + clean clean-go clean-contracts clean-docs clean-devnet-files clean-dapps clean-test-dependencies clean-test-logs clean-integration-compose clean-debian-packages \ test unit-test unit-test-with-compose integration-test integration-test-with-compose integration-test-local test-with-compose ci-test coverage-report \ + integration-test-shard-check list-integration-shards list-integration-cells \ generate generate-contracts generate-config generate-inspect check-generate generate-db \ docs generate-cli-docs generate-config-docs \ - lint fmt fmt-check vet escape \ + lint fmt fmt-check vet escape check-license \ devnet image tester-image debian-packager run-with-compose shutdown-compose \ start start-devnet start-postgres stop stop-devnet stop-postgres restart restart-devnet restart-postgres \ install copy-debian-package build-debian-package \ diff --git a/README.md b/README.md index 8d6500da5..891bd548e 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ We provide packages for debian (.deb) in **amd64** and **arm64** variants on the - GNU Make >= 3.81 - Go >= 1.24.1 -Follow the Cartesi Machine installation instructions [here](https://github.com/cartesi/machine-emulator?tab=readme-ov-file#installation). +Follow the [Cartesi Machine installation instructions](https://github.com/cartesi/machine-emulator?tab=readme-ov-file#installation). 
##### Build diff --git a/internal/claimer/foreclosed_apps_test.go b/internal/claimer/foreclosed_apps_test.go index 6a3ab099e..0a88324bd 100644 --- a/internal/claimer/foreclosed_apps_test.go +++ b/internal/claimer/foreclosed_apps_test.go @@ -149,15 +149,14 @@ func TestProcessForeclosedApps_DrainCheckErrorsAppendAndContinue(t *testing.T) { app2 := foreclosedAppHelper(2, 100, model.Consensus_Authority) for _, app := range []*model.Application{app1, app2} { - r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", - mock.Anything, app.ID, app.ForecloseBlock, - ).Return(0, nil).Once() r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(false, errors.New("db unavailable")).Once() } - // HasUnreconciledClaimsBeforeBlock must not be reached for either app — the - // undrained check errored and the per-app branch `continue`d. No expectation. + // The drain check runs first; its error makes the per-app branch `continue` + // before terminalizing or reconciling. Neither + // ForecloseUnacceptedEpochsAtOrAfterBlock nor HasUnreconciledClaimsBeforeBlock + // is reached — no expectation registered for either. errs := s.processForeclosedApps(map[int64]*model.Application{app1.ID: app1, app2.ID: app2}) assert.Len(t, errs, 2, "each app's drain error is appended; the pass does not abort early") @@ -196,6 +195,15 @@ func TestProcessForeclosedApps_NoTransitionWhenDrained(t *testing.T) { assert.Empty(t, errs) } +// TestProcessForeclosedApps_DefersWhenInputsUndrained verifies the drain-gate +// ordering that protects an input landing in the foreclose block. While any +// pre-foreclosure input is still undrained, the pass defers WITHOUT +// terminalizing. Terminalizing the straddling epoch first would flip it to +// CLAIM_FORECLOSED and strand its unprocessed same-block input (it would vanish +// from both this drain check and the manager's machine-drain gate, and the +// machine would be torn down before advancing it). 
The absent +// ForecloseUnacceptedEpochsAtOrAfterBlock expectation is the regression guard: +// testify/mock fails on the unexpected call if terminalization runs too early. func TestProcessForeclosedApps_DefersWhenInputsUndrained(t *testing.T) { s, r, _ := newServiceMock() defer r.AssertExpectations(t) @@ -203,35 +211,41 @@ func TestProcessForeclosedApps_DefersWhenInputsUndrained(t *testing.T) { app := foreclosedAppHelper(1, 100, model.Consensus_Authority) s.Context = context.Background() - r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", - mock.Anything, app.ID, app.ForecloseBlock, - ).Return(0, nil).Once() r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(true, nil).Once() - // No HasUnreconciledClaimsBeforeBlock expectation — unresolved inputs - // must stop the drain check before claim-state reconciliation. + // No ForecloseUnacceptedEpochsAtOrAfterBlock and no + // HasUnreconciledClaimsBeforeBlock: an undrained input defers the whole pass + // before terminalization and before claim reconciliation. errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs, "input-drain deferral is not an error") } -func TestProcessForeclosedApps_TerminalizesUnacceptedOverlapBeforeDrain(t *testing.T) { +// TestProcessForeclosedApps_TerminalizesUnacceptedOverlapAfterDrain verifies the +// other side of the gate: once the drain check clears (no undrained inputs), the +// straddling/after epochs that can never be accepted are terminalized to +// CLAIM_FORECLOSED, then reconciliation completes. 
+func TestProcessForeclosedApps_TerminalizesUnacceptedOverlapAfterDrain(t *testing.T) { s, r, _ := newServiceMock() defer r.AssertExpectations(t) app := foreclosedAppHelper(1, 100, model.Consensus_Authority) s.Context = context.Background() - r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", - mock.Anything, app.ID, app.ForecloseBlock, - ).Return(2, nil).Once() - r.On("HasUndrainedEpochsBeforeBlock", + // Pin the sequence: the drain check MUST run before terminalization (else a + // straddling-epoch input is stranded — the bug this ordering prevents), and + // terminalization before the claim-reconciliation check. + drain := r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(false, nil).Once() - r.On("HasUnreconciledClaimsBeforeBlock", + terminalize := r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", + mock.Anything, app.ID, app.ForecloseBlock, + ).Return(2, nil).Once() + reconcile := r.On("HasUnreconciledClaimsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(false, nil).Once() + mock.InOrder(drain, terminalize, reconcile) errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs) diff --git a/internal/claimer/foreclosure.go b/internal/claimer/foreclosure.go index b7b4010b8..e8bccd81e 100644 --- a/internal/claimer/foreclosure.go +++ b/internal/claimer/foreclosure.go @@ -80,41 +80,52 @@ func (s *Service) processForeclosedApps( ) continue } - terminalized, err := s.repository.ForecloseUnacceptedEpochsAtOrAfterBlock( + // Drain gate FIRST, terminalize second. An input can land in the + // foreclose block itself (before the foreclose tx, so it is valid and is + // indexed up to and including foreclose_block). 
Terminalizing the + // straddling epoch before that input is advanced would flip the epoch to + // CLAIM_FORECLOSED, which hides its still-unprocessed input from this + // drain check AND from the manager's machine-drain gate + // (HasUndrainedEpochsBeforeBlock excludes terminal epochs) — so the + // machine is torn down and the input is never processed, leaving the + // final machine state one input behind the chain. Wait for the drain, + // then terminalize. PRT gates terminalization the same way + // (internal/prt/service.go handleForeclosedApp). + undrained, err := s.repository.HasUndrainedEpochsBeforeBlock( s.Context, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( - "terminalizing unaccepted epochs for foreclosed app %s: %w", + "checking input drain progress for foreclosed app %s: %w", app.IApplicationAddress, err)) continue } - if terminalized > 0 { + if undrained { s.Logger.Info( - "Foreclosed application terminalized epochs that cannot be accepted", + "Foreclosed application still advancing pre-foreclosure inputs", "application", app.Name, "address", app.IApplicationAddress, "foreclose_block", app.ForecloseBlock, - "epochs", terminalized, ) + continue } - undrained, err := s.repository.HasUndrainedEpochsBeforeBlock( + terminalized, err := s.repository.ForecloseUnacceptedEpochsAtOrAfterBlock( s.Context, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( - "checking input drain progress for foreclosed app %s: %w", + "terminalizing unaccepted epochs for foreclosed app %s: %w", app.IApplicationAddress, err)) continue } - if undrained { + if terminalized > 0 { s.Logger.Info( - "Foreclosed application still advancing pre-foreclosure inputs", + "Foreclosed application terminalized epochs that cannot be accepted", "application", app.Name, "address", app.IApplicationAddress, "foreclose_block", app.ForecloseBlock, + "epochs", terminalized, ) - continue } unreconciled, err := 
s.repository.HasUnreconciledClaimsBeforeBlock( s.Context, app.ID, app.ForecloseBlock, diff --git a/scripts/check-integration-shards.sh b/scripts/check-integration-shards.sh new file mode 100755 index 000000000..ffffe4a7c --- /dev/null +++ b/scripts/check-integration-shards.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# (c) Cartesi and individual authors (see AUTHORS) +# SPDX-License-Identifier: Apache-2.0 (see LICENSE) +# +# Shard coverage guard: verifies that every top-level integration test +# belongs to exactly one shard and that every shard matches at least one +# test. Prevents false greens when a new integration test is added but not +# assigned to a shard. +# +# Usage: check-integration-shards.sh NAME=REGEX [NAME=REGEX ...] +# +# Test discovery uses `go test -list`, which builds the package and runs +# TestMain (TestMain skips node management when listing). The package has no +# CGo dependency on the Cartesi C library, so a plain Go toolchain suffices. + +set -euo pipefail + +cd "$(dirname "$0")/.." + +if [ "$#" -lt 1 ]; then + echo "usage: $0 = [= ...]" >&2 + exit 1 +fi + +SHARD_NAMES=() +SHARD_REGEXES=() +for arg in "$@"; do + name="${arg%%=*}" + regex="${arg#*=}" + if [ -z "$name" ] || [ -z "$regex" ] || [ "$name" = "$arg" ]; then + echo "ERROR: malformed shard spec '$arg' (expected =)" >&2 + exit 1 + fi + SHARD_NAMES+=("$name") + SHARD_REGEXES+=("$regex") +done + +# Discover top-level tests. Keep the build/list step separate from the grep so +# a build failure surfaces as a build failure — otherwise the grep swallows the +# empty output and the script misreports it as "no tests discovered". +if ! list_output=$(go test -list '^Test' -tags=endtoendtests ./test/integration/... 2>&1); then + echo "ERROR: failed to list integration tests (build error?):" >&2 + echo "$list_output" >&2 + exit 1 +fi + +# Filter out the trailing "ok " summary line of -list. 
+TESTS=$(printf '%s\n' "$list_output" | grep -E '^Test[A-Za-z0-9_]*$' || true) + +if [ -z "$TESTS" ]; then + echo "ERROR: no top-level integration tests discovered" >&2 + exit 1 +fi + +fail=0 + +while IFS= read -r t; do + [ -n "$t" ] || continue + count=0 + matched="" + for i in "${!SHARD_NAMES[@]}"; do + if printf '%s\n' "$t" | grep -Eq -- "${SHARD_REGEXES[$i]}"; then + count=$((count + 1)) + matched="$matched ${SHARD_NAMES[$i]}" + fi + done + if [ "$count" -eq 0 ]; then + echo "ERROR: test $t matches no shard" >&2 + fail=1 + elif [ "$count" -gt 1 ]; then + echo "ERROR: test $t matches multiple shards:$matched" >&2 + fail=1 + fi +done <<<"$TESTS" + +for i in "${!SHARD_NAMES[@]}"; do + if ! printf '%s\n' "$TESTS" | grep -Eq -- "${SHARD_REGEXES[$i]}"; then + echo "ERROR: shard ${SHARD_NAMES[$i]} (${SHARD_REGEXES[$i]}) matches no tests" >&2 + fail=1 + fi +done + +if [ "$fail" -ne 0 ]; then + echo "FAIL: shard coverage check failed" >&2 + exit 1 +fi + +echo "OK: $(printf '%s\n' "$TESTS" | wc -l | tr -d ' ') tests covered by ${#SHARD_NAMES[@]} shards" diff --git a/scripts/check-license-header.sh b/scripts/check-license-header.sh new file mode 100755 index 000000000..e587e84fe --- /dev/null +++ b/scripts/check-license-header.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# (c) Cartesi and individual authors (see AUTHORS) +# SPDX-License-Identifier: Apache-2.0 (see LICENSE) +# +# Verifies that every tracked Go source file carries the license header +# (.github/license-check/header.txt) as a contiguous, in-order block within +# its first lines. The block may be preceded by a "Code generated ... DO NOT +# EDIT." preamble. Generated code that lacks the header entirely is excluded. +# +# Usage: check-license-header.sh + +set -euo pipefail + +cd "$(dirname "$0")/.." + +HEADER_FILE=".github/license-check/header.txt" +TOP_LINES=10 + +# has_header FILE: succeeds iff the exact header block appears, in order and +# contiguous, somewhere within the first TOP_LINES lines of FILE. 
+# +# The previous implementation counted how many of the first lines matched any +# header line and compared the count to the header length. That was order- and +# uniqueness-blind: a reversed header passed, and so did a file with the +# copyright line duplicated but the SPDX line missing. We compare positionally +# instead so the header must appear verbatim and in the right order. +has_header() { + head -n "$TOP_LINES" "$1" | awk -v hdr="$HEADER_FILE" ' + BEGIN { n = 0; while ((getline line < hdr) > 0) h[++n] = line } + { buf[NR] = $0 } + END { + for (i = 1; i + n - 1 <= NR; i++) { + ok = 1 + for (j = 1; j <= n; j++) + if (buf[i + j - 1] != h[j]) { ok = 0; break } + if (ok) exit 0 + } + exit 1 + }' +} + +fail=0 +while IFS= read -r f; do + if ! has_header "$f"; then + echo "ERROR: missing or wrong license header: $f" >&2 + fail=1 + fi +done < <(git ls-files '*.go' \ + ':!internal/repository/postgres/db' \ + ':!pkg/contracts' \ + ':!pkg/inspectclient/generated.go') + +if [ "$fail" -ne 0 ]; then + echo "FAIL: license header check failed (expected header below)" >&2 + cat "$HEADER_FILE" >&2 + exit 1 +fi + +echo "OK: license headers present" diff --git a/scripts/compose-integration-run.sh b/scripts/compose-integration-run.sh new file mode 100755 index 000000000..01653d78a --- /dev/null +++ b/scripts/compose-integration-run.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# (c) Cartesi and individual authors (see AUTHORS) +# SPDX-License-Identifier: Apache-2.0 (see LICENSE) +# +# Runs the integration-test Compose service inside an isolated Compose +# project and captures all logs (run output, node log, service logs) into a +# single log file. Cleanup is always project-scoped so concurrent shards do +# not interfere with each other. 
+#
+# Usage: compose-integration-run.sh
+#
+# Environment:
+#   COMPOSE_PROJECT   Compose project name (required)
+#   INTEGRATION_LOGS  Log file to write (required; truncated at start)
+#   TEST_PATTERN      Optional anchored regex selecting a shard of top-level
+#                     tests (forwarded to the test container; empty = full suite)
+#   SHARD_NAME        Optional shard label (log readability only)
+#   NODE_TOPOLOGY     Node deployment topology (standalone | multiprocess);
+#                     forwarded to the container, where TestMain starts and
+#                     manages the matching node.
+
+set -euo pipefail
+
+COMPOSE_FILE="test/compose/compose.integration.yaml"
+NODE_LOG_PATH="/var/lib/cartesi-rollups-node/logs/node.log"
+
+: "${COMPOSE_PROJECT:?COMPOSE_PROJECT is required}"
+: "${INTEGRATION_LOGS:?INTEGRATION_LOGS is required}"
+export TEST_PATTERN="${TEST_PATTERN:-}"
+export SHARD_NAME="${SHARD_NAME:-full}"
+export NODE_TOPOLOGY="${NODE_TOPOLOGY:-standalone}"
+
+compose() { # docker compose pinned to this run's project name and compose file
+    docker compose -p "$COMPOSE_PROJECT" -f "$COMPOSE_FILE" "$@"
+}
+
+cleanup() { # append node and service logs to the log file, then tear the project down
+    # The in-container trap already prints the node log into the run output;
+    # this volume copy covers abnormal exits (e.g. an OOM-killed container).
+    {
+        echo
+        echo "=== NODE LOG (from volume) ==="
+    } >>"$INTEGRATION_LOGS"
+    compose run --rm --no-deps --entrypoint cat integration-test \
+        "$NODE_LOG_PATH" >>"$INTEGRATION_LOGS" 2>/dev/null || true
+    {
+        echo
+        echo "=== COMPOSE SERVICE LOGS ==="
+    } >>"$INTEGRATION_LOGS"
+    compose logs --no-color >>"$INTEGRATION_LOGS" 2>&1 || true
+    compose down -v --remove-orphans || true # best effort; -v also removes the project's volumes
+}
+trap cleanup EXIT # registered before the run so it fires on every exit path
+
+: >"$INTEGRATION_LOGS" # truncate (or create) the log file
+echo "Running integration tests (project=$COMPOSE_PROJECT shard=$SHARD_NAME topology=$NODE_TOPOLOGY logs=$INTEGRATION_LOGS)"
+
+# pipefail keeps the test exit code authoritative despite the tee.
+compose run --rm --remove-orphans integration-test 2>&1 | tee -a "$INTEGRATION_LOGS" diff --git a/scripts/run-integration-tests.sh b/scripts/run-integration-tests.sh index 8b00fdbdf..e0a6dab13 100755 --- a/scripts/run-integration-tests.sh +++ b/scripts/run-integration-tests.sh @@ -7,8 +7,13 @@ # so this script only needs to set up PATH and run the tests. # # Usage: run-integration-tests.sh +# +# Environment: +# TEST_PATTERN Optional anchored regex passed to go test -run to select a +# shard of top-level tests. Empty means run the full suite. +# SHARD_NAME Optional shard label, used only for log readability. -set -eu +set -euo pipefail export PATH="/opt/go/bin:/build/cartesi/go/rollups-node:$PATH" @@ -20,20 +25,73 @@ if ! command -v cartesi-rollups-machine-tool >/dev/null 2>&1; then fi which cartesi-rollups-machine-tool || { echo "ERROR: cartesi-rollups-machine-tool not found on PATH"; exit 1; } -# Print the node log on exit so it appears in docker compose logs. NODE_LOG="${CARTESI_TEST_NODE_LOG_FILE:-}" -if [ -n "$NODE_LOG" ]; then - trap 'echo "=== NODE LOG ==="; cat "$NODE_LOG" 2>/dev/null || true' EXIT +REPORT="$(mktemp)" + +cleanup() { + # Print the node log on exit so it appears in docker compose logs. + if [ -n "$NODE_LOG" ]; then + echo "=== NODE LOG ===" + cat "$NODE_LOG" 2>/dev/null || true + fi + rm -f "$REPORT" +} +trap cleanup EXIT + +# A skipped top-level test is not a pass. In the compose/CI topology the node +# is always test-managed, so an entire top-level test skipping (e.g. TestRestart +# deciding the node looks externally managed) means the shard reported success +# without exercising what it exists to cover — a false green. Suite/subtest +# skips are allowed; only whole top-level Test* functions are checked. 
+report_skips() { + if [ -n "$1" ]; then + echo "ERROR: top-level test(s) skipped in shard '${SHARD_NAME:-full}' (a skip is not a pass):" >&2 + echo "$1" | sed 's/^/ - /' >&2 + return 1 + fi + return 0 +} + +# Parse skipped top-level tests from a go test -json event stream. The +# "Test":"Test..." match deliberately excludes names containing '/', so +# subtest skips (e.g. TestEchoQuorum/Foo) are ignored. +toplevel_skips_json() { + grep '"Action":"skip"' "$1" 2>/dev/null \ + | grep -oE '"Test":"Test[A-Za-z0-9_]*"' \ + | sed -E 's/.*"Test":"([^"]*)".*/\1/' \ + | sort -u || true +} + +# Parse skipped top-level tests from captured `go test -v` output. Top-level +# SKIP lines start at column 0; subtest SKIP lines are indented. +toplevel_skips_verbose() { + grep -E '^--- SKIP: Test' "$1" 2>/dev/null \ + | sed -E 's/^--- SKIP: (Test[A-Za-z0-9_]*).*/\1/' \ + | sort -u || true +} + +# Shard selection: a non-empty TEST_PATTERN narrows the run to the matching +# top-level tests. Built as a bash array so the pattern is never re-expanded +# by the shell. +GO_TEST_ARGS=(-count=1 -v -timeout 55m -ldflags "-r /opt/cartesi/lib" -tags=endtoendtests) +if [ -n "${TEST_PATTERN:-}" ]; then + echo "Running integration shard '${SHARD_NAME:-unnamed}' with -run '${TEST_PATTERN}'" + GO_TEST_ARGS+=(-run "${TEST_PATTERN}") fi +GO_TEST_ARGS+=(./test/integration/...) # Timeout must be less than the CI job timeout-minutes (60) to produce # a useful go test panic instead of an abrupt CI kill. +status=0 if command -v gotestsum >/dev/null 2>&1; then - gotestsum --format testdox -- -count=1 -v -timeout 55m \ - -ldflags "-r /opt/cartesi/lib" \ - -tags=endtoendtests ./test/integration/... + # --jsonfile captures the machine-readable event stream alongside the + # human-readable --format output, so we can post-check for skipped tests. + gotestsum --jsonfile "$REPORT" --format "${GOTESTSUM_FORMAT:-testdox}" \ + -- "${GO_TEST_ARGS[@]}" || status=$? 
+ report_skips "$(toplevel_skips_json "$REPORT")" || status=1 else - go test -count=1 -v -timeout 55m \ - -ldflags "-r /opt/cartesi/lib" \ - -tags=endtoendtests ./test/integration/... + go test "${GO_TEST_ARGS[@]}" | tee "$REPORT" || status=$? + report_skips "$(toplevel_skips_verbose "$REPORT")" || status=1 fi + +exit "$status" diff --git a/test/compose/compose.integration.yaml b/test/compose/compose.integration.yaml index 7e8ef715d..3d305f338 100644 --- a/test/compose/compose.integration.yaml +++ b/test/compose/compose.integration.yaml @@ -109,6 +109,15 @@ services: restart: "no" environment: <<: *env + # Shard selection (empty = full suite); see scripts/run-integration-tests.sh. + TEST_PATTERN: ${TEST_PATTERN:-} + SHARD_NAME: ${SHARD_NAME:-full} + # Node topology: standalone (all-in-one) or multiprocess (per-service + # subprocesses). TestMain starts and manages either in this container. + NODE_TOPOLOGY: ${NODE_TOPOLOGY:-standalone} + # testdox prints one line per completed test; standard-verbose streams + # go test -v output live (VERBOSE=true in the Makefile selects it). + GOTESTSUM_FORMAT: ${GOTESTSUM_FORMAT:-testdox} CARTESI_BLOCKCHAIN_DEFAULT_BLOCK: latest CARTESI_TEST_DAPP_PATH: /var/lib/cartesi-rollups-node/dapps/echo-dapp CARTESI_TEST_REJECT_DAPP_PATH: /var/lib/cartesi-rollups-node/dapps/reject-loop-dapp diff --git a/test/integration/main_test.go b/test/integration/main_test.go index 8595d6756..3c8b4b5e0 100644 --- a/test/integration/main_test.go +++ b/test/integration/main_test.go @@ -15,17 +15,17 @@ import ( "time" ) -// TestMain manages the node process and enforces sequential test execution. +// TestMain manages the node and enforces sequential test execution. // -// If no node is already running on port 10000 (e.g., in Docker Compose), -// TestMain starts the node binary as a subprocess, waits for health, and -// stops it after all tests complete. 
This makes the node lifecycle -// transparent to individual test suites — they don't need to know whether -// the node was started by the test or by an external process. +// Unless a node is already running on port 10000, TestMain starts the +// test-managed node — the all-in-one process (standalone) or the service +// subprocesses (multiprocess, NODE_TOPOLOGY=multiprocess) — waits for health, +// and stops it after all tests complete. This keeps the node lifecycle +// transparent to the suites. // -// Restart/snapshot tests call stopSharedNode/startSharedNode to exercise -// the node's synchronization path. When the node is externally managed -// (Compose), those tests are skipped. +// Restart/snapshot tests call stopSharedNode/startSharedNode to exercise the +// node's synchronization path; this works under either topology. They are +// skipped only when an external node is already running (not test-managed). func TestMain(m *testing.M) { flag.Parse() if testing.Short() { @@ -33,6 +33,11 @@ func TestMain(m *testing.M) { os.Exit(0) } + // -list only builds and lists tests; skip node management entirely. + if l := flag.Lookup("test.list"); l != nil && l.Value.String() != "" { + os.Exit(m.Run()) + } + // Enforce sequential execution — tests share blockchain state. p := flag.Lookup("test.parallel") if p != nil && p.Value.String() != "1" { @@ -46,63 +51,52 @@ func TestMain(m *testing.M) { } } - // Start the node if none is running (local execution). - // In Docker Compose, the node is a separate container and is already - // running — we detect this by checking if port 10000 is in use. - if nodePortAvailable() { - artifactsDir, err := integrationArtifactsDir() + // The node is started here by TestMain (the Compose integration service runs + // this same test binary) unless a developer already has one running on + // :10000, in which case we attach and the restart tests are skipped. 
The + // multiprocess topology starts the services as subprocesses — host or inside + // the test container — so the lifecycle tests can restart them too. + nodeTopology = envOrDefault("NODE_TOPOLOGY", "standalone") + healthTimeout := 2 * time.Minute + + mustPrepareRuntime := func() string { + logPath, err := prepareNodeRuntime() if err != nil { - fmt.Fprintf(os.Stderr, "failed to prepare integration artifacts dir: %v\n", err) + fmt.Fprintf(os.Stderr, "failed to prepare node runtime: %v\n", err) os.Exit(1) } - os.Setenv("CARTESI_TEST_ARTIFACTS_DIR", artifactsDir) - os.Setenv("CARTESI_TEST_NODE_WORKDIR", artifactsDir) - fmt.Fprintf(os.Stderr, "Integration artifacts dir: %s\n", artifactsDir) - - // `make env` exports CARTESI_SNAPSHOTS_DIR=snapshots, which used to - // resolve under test/integration because the node inherited go test's - // package cwd. Keep user-provided custom paths, but route the default - // snapshot path into the integration artifacts directory. - if snapshotsDir := os.Getenv("CARTESI_SNAPSHOTS_DIR"); snapshotsDir == "" || snapshotsDir == "snapshots" { - os.Setenv("CARTESI_SNAPSHOTS_DIR", filepath.Join(artifactsDir, "snapshots")) - } - - logPath := os.Getenv("CARTESI_TEST_NODE_LOG_FILE") - if logPath == "" { - f, err := os.CreateTemp("", "rollups-node-integration-*.log") - if err != nil { - fmt.Fprintf(os.Stderr, - "failed to create node log file: %v\n", err) - os.Exit(1) - } - logPath = f.Name() - f.Close() - os.Setenv("CARTESI_TEST_NODE_LOG_FILE", logPath) - } - - fmt.Fprintf(os.Stderr, "Starting node (log: %s)...\n", logPath) + return logPath + } - sharedNode, err = startNodeWithLog(logPath) + bringUp := func(h nodeHandle, err error) { if err != nil { fmt.Fprintf(os.Stderr, "failed to start node: %v\n", err) os.Exit(1) } - - ctx, cancel := context.WithTimeout( - context.Background(), 2*time.Minute) - if err := sharedNode.waitForHealth(ctx, nil); err != nil { + ctx, cancel := context.WithTimeout(context.Background(), healthTimeout) + if err := 
h.waitForHealth(ctx, nil); err != nil { cancel() - sharedNode.stop(nil) - fmt.Fprintf(os.Stderr, - "node failed to become healthy: %v\n", err) + h.stop(nil) + fmt.Fprintf(os.Stderr, "node failed to become healthy: %v\n", err) os.Exit(1) } cancel() + sharedNode = h fmt.Fprintln(os.Stderr, "Node is healthy. Running integration tests...") - } else { + } + + switch { + case nodeTopology == "multiprocess": + logPath := mustPrepareRuntime() + fmt.Fprintf(os.Stderr, "Starting multiprocess node (log: %s)...\n", logPath) + bringUp(startMultiNode(logPath)) + case nodePortAvailable(): + logPath := mustPrepareRuntime() + fmt.Fprintf(os.Stderr, "Starting node (log: %s)...\n", logPath) + bringUp(startNodeWithLog(logPath)) + default: fmt.Fprintln(os.Stderr, - "Node already running on port 10000 (external). "+ - "Restart tests will be skipped.") + "Node already running on :10000 (external). Restart tests will be skipped.") } code := m.Run() @@ -115,6 +109,38 @@ func TestMain(m *testing.M) { os.Exit(code) } +// prepareNodeRuntime sets up the artifacts dir, snapshot dir, and node log +// file shared by the standalone and host-multiprocess paths, returning the +// log path. +func prepareNodeRuntime() (string, error) { + artifactsDir, err := integrationArtifactsDir() + if err != nil { + return "", fmt.Errorf("prepare integration artifacts dir: %w", err) + } + os.Setenv("CARTESI_TEST_ARTIFACTS_DIR", artifactsDir) + os.Setenv("CARTESI_TEST_NODE_WORKDIR", artifactsDir) + fmt.Fprintf(os.Stderr, "Integration artifacts dir: %s\n", artifactsDir) + + // `make env` exports CARTESI_SNAPSHOTS_DIR=snapshots, which used to resolve + // under test/integration because the node inherited go test's package cwd. + // Keep user-provided custom paths, but route the default into the artifacts dir. 
+ if snapshotsDir := os.Getenv("CARTESI_SNAPSHOTS_DIR"); snapshotsDir == "" || snapshotsDir == "snapshots" { + os.Setenv("CARTESI_SNAPSHOTS_DIR", filepath.Join(artifactsDir, "snapshots")) + } + + logPath := os.Getenv("CARTESI_TEST_NODE_LOG_FILE") + if logPath == "" { + f, err := os.CreateTemp("", "rollups-node-integration-*.log") + if err != nil { + return "", fmt.Errorf("create node log file: %w", err) + } + logPath = f.Name() + f.Close() + os.Setenv("CARTESI_TEST_NODE_LOG_FILE", logPath) + } + return logPath, nil +} + func integrationArtifactsDir() (string, error) { if dir := os.Getenv("CARTESI_TEST_ARTIFACTS_DIR"); dir != "" { absDir, err := filepath.Abs(dir) diff --git a/test/integration/multinode_helpers_test.go b/test/integration/multinode_helpers_test.go new file mode 100644 index 000000000..72c302adf --- /dev/null +++ b/test/integration/multinode_helpers_test.go @@ -0,0 +1,172 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//go:build endtoendtests + +package integration + +import ( + "context" + "fmt" + "net/http" + "os" + "os/exec" + "testing" + "time" +) + +// multiService describes one service subprocess and the telemetry port +// whose /readyz endpoint reports its health. Each service keeps the +// read-API ports the tests already expect (jsonrpc :10011, inspect :10012), so +// only the telemetry ports are remapped to avoid collisions on localhost. +type multiService struct { + name string + telemetryAddr string + extraEnv []string +} + +// multiNodeServices is the full deployment the multiprocess topology runs on +// the host — every service the standalone node embeds. prt is included so the +// prt shard's Dave-consensus tests run; it idles for authority/quorum apps. 
+var multiNodeServices = []multiService{
+	{name: "cartesi-rollups-evm-reader", telemetryAddr: ":10001",
+		extraEnv: []string{"CARTESI_EVM_READER_TELEMETRY_ADDRESS=:10001"}},
+	{name: "cartesi-rollups-advancer", telemetryAddr: ":10002",
+		extraEnv: []string{
+			"CARTESI_ADVANCER_TELEMETRY_ADDRESS=:10002",
+			"CARTESI_INSPECT_ADDRESS=:10012",
+			"CARTESI_FEATURE_INSPECT_ENABLED=true",
+		}},
+	{name: "cartesi-rollups-validator", telemetryAddr: ":10003",
+		extraEnv: []string{"CARTESI_VALIDATOR_TELEMETRY_ADDRESS=:10003"}},
+	{name: "cartesi-rollups-claimer", telemetryAddr: ":10004",
+		extraEnv: []string{"CARTESI_CLAIMER_TELEMETRY_ADDRESS=:10004"}},
+	{name: "cartesi-rollups-jsonrpc-api", telemetryAddr: ":10005",
+		extraEnv: []string{
+			"CARTESI_JSONRPC_TELEMETRY_ADDRESS=:10005",
+			"CARTESI_JSONRPC_API_ADDRESS=:10011",
+		}},
+	{name: "cartesi-rollups-prt", telemetryAddr: ":10006",
+		extraEnv: []string{"CARTESI_PRT_TELEMETRY_ADDRESS=:10006"}},
+}
+
+// multiNode is a running host multiprocess deployment.
+type multiNode struct {
+	procs   []*exec.Cmd // started service processes, in start order
+	addrs   []string    // telemetry address of each entry in procs
+	logFile *os.File    // shared stdout/stderr sink for every service
+	tail    *exec.Cmd   // tail -f process streaming the log to the terminal
+	tty     *os.File    // /dev/tty FD used by tail; closed in stop()
+}
+
+// startMultiNode starts each service binary as a host subprocess. Every binary
+// is resolved on PATH first, so a missing one fails before anything is opened
+// or started. Each process gets the inherited environment, the polling
+// intervals set to 1, its own multiNodeServices overrides, and finally
+// extraEnv. All output is appended to logPath and, when /dev/tty can be opened,
+// tailed to it. If a later start fails, the already-started ones are stopped.
+func startMultiNode(logPath string, extraEnv ...string) (*multiNode, error) {
+	for _, svc := range multiNodeServices {
+		if _, err := exec.LookPath(svc.name); err != nil {
+			return nil, fmt.Errorf("%s not found on PATH: %w", svc.name, err)
+		}
+	}
+
+	logFile, err := os.OpenFile( //nolint:gosec
+		logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) //nolint:mnd
+	if err != nil {
+		return nil, fmt.Errorf("open log file %s: %w", logPath, err)
+	}
+
+	mn := &multiNode{logFile: logFile}
+
+	// Stream the combined service log to the terminal through a separate
+	// tail process writing to /dev/tty, which bypasses go test / gotestsum
+	// output capture. It starts before the services so their startup output
+	// is visible; when /dev/tty cannot be opened (CI, compose) it is skipped.
+	if tty, ttyErr := os.OpenFile("/dev/tty", os.O_WRONLY, 0); ttyErr == nil {
+		tail := exec.Command("tail", "-f", logPath) //nolint:gosec
+		tail.Stdout, tail.Stderr = tty, tty
+		if err := tail.Start(); err != nil {
+			tty.Close()
+		} else {
+			mn.tail = tail
+			mn.tty = tty
+		}
+	}
+
+	base := append(os.Environ(),
+		"CARTESI_ADVANCER_POLLING_INTERVAL=1",
+		"CARTESI_VALIDATOR_POLLING_INTERVAL=1",
+		"CARTESI_CLAIMER_POLLING_INTERVAL=1",
+		"CARTESI_EVM_READER_POLLING_INTERVAL=1",
+		"CARTESI_PRT_POLLING_INTERVAL=1",
+	)
+
+	for _, svc := range multiNodeServices {
+		cmd := exec.Command(svc.name) //nolint:gosec
+		cmd.Stdout = logFile
+		cmd.Stderr = logFile
+		cmd.Env = append(append(append([]string{}, base...), svc.extraEnv...), extraEnv...)
+		if err := cmd.Start(); err != nil {
+			mn.stop(nil)
+			return nil, fmt.Errorf("start %s: %w", svc.name, err)
+		}
+		fmt.Fprintf(os.Stderr, " started %s (telemetry %s, pid %d)\n",
+			svc.name, svc.telemetryAddr, cmd.Process.Pid)
+		mn.procs = append(mn.procs, cmd)
+		mn.addrs = append(mn.addrs, svc.telemetryAddr)
+	}
+	return mn, nil
+}
+
+// waitForHealth polls every service's /readyz until all respond 200 OK or the
+// context is cancelled.
+func (mn *multiNode) waitForHealth(ctx context.Context, _ testing.TB) error { + client := &http.Client{Timeout: 2 * time.Second} + for i, addr := range mn.addrs { + svc := multiNodeServices[i].name + url := "http://localhost" + addr + "/readyz" + err := pollUntil(ctx, 2*time.Second, func() (bool, error) { + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return false, nil + } + resp, err := client.Do(req) //nolint:gosec // url is a fixed localhost telemetry port + if err != nil { + return false, nil + } + defer resp.Body.Close() + return resp.StatusCode == http.StatusOK, nil + }) + if err != nil { + return fmt.Errorf("%s did not become healthy at %s: %w", svc, url, err) + } + fmt.Fprintf(os.Stderr, " %s healthy\n", svc) + } + return nil +} + +// stop interrupts every service subprocess (in reverse start order) and waits +// for it to exit, then stops the log tail and closes the log file. +func (mn *multiNode) stop(_ testing.TB) { + // Stop the tail first so it does not print shutdown noise. + if mn.tail != nil && mn.tail.Process != nil { + _ = mn.tail.Process.Kill() + _ = mn.tail.Wait() + } + if mn.tty != nil { + mn.tty.Close() + } + for i := len(mn.procs) - 1; i >= 0; i-- { + if cmd := mn.procs[i]; cmd.Process != nil { + _ = cmd.Process.Signal(os.Interrupt) + } + } + for _, cmd := range mn.procs { + _ = cmd.Wait() + } + if mn.logFile != nil { + mn.logFile.Close() + } +} diff --git a/test/integration/node_helpers_test.go b/test/integration/node_helpers_test.go index 1ef047447..c77736409 100644 --- a/test/integration/node_helpers_test.go +++ b/test/integration/node_helpers_test.go @@ -18,15 +18,26 @@ import ( const nodeBinary = "cartesi-rollups-node" -// sharedNode is the test-managed node process, started by TestMain when no -// external node is running. All test suites share this instance. Restart -// tests stop and restart it via stopSharedNode/startSharedNode. 
-// When nil, the node is externally managed (e.g., Docker Compose) and -// restart tests are skipped. -var sharedNode *nodeProcess - -// isNodeSelfManaged returns true if TestMain started the node process. -// When false, the node is externally managed and cannot be restarted. +// nodeHandle is a running test-managed node: either the standalone all-in-one +// process or the multiprocess set of service subprocesses. Both can be stopped +// and restarted, so the node-lifecycle tests run under either topology. +type nodeHandle interface { + stop(t testing.TB) + waitForHealth(ctx context.Context, t testing.TB) error +} + +// sharedNode is the test-managed node, started by TestMain unless an external +// node is already running on :10000. All suites share it; lifecycle tests stop +// and restart it via stopSharedNode/startSharedNode. nil when externally +// managed. Its concrete type follows nodeTopology. +var sharedNode nodeHandle + +// nodeTopology is the deployment TestMain manages ("standalone" or +// "multiprocess"); it selects what startSharedNode(WithEnv) restarts. +var nodeTopology string + +// isNodeSelfManaged returns true if TestMain started the node (so it can be +// restarted). False when an external node is already running. func isNodeSelfManaged() bool { return sharedNode != nil } @@ -51,26 +62,34 @@ func startSharedNode(t testing.TB) { // inject extra environment variables (e.g., // CARTESI_FEATURE_CLAIM_SUBMISSION_ENABLED=false to bring the node up in // reader mode for a single test phase). Restore default mode on test -// teardown by stopping the node and calling startSharedNode again. +// teardown by stopping the node and calling startSharedNode again. Under the +// multiprocess topology this starts/stops the whole service set. 
func startSharedNodeWithEnv(t testing.TB, extraEnv ...string) { if sharedNode != nil { t.Fatal("cannot start node: already running") } logPath := os.Getenv("CARTESI_TEST_NODE_LOG_FILE") - var err error - sharedNode, err = startNodeWithLog(logPath, extraEnv...) + var ( + h nodeHandle + err error + ) + if nodeTopology == "multiprocess" { + h, err = startMultiNode(logPath, extraEnv...) + } else { + h, err = startNodeWithLog(logPath, extraEnv...) + } if err != nil { t.Fatalf("failed to start node: %v", err) } ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() - if err := sharedNode.waitForHealth(ctx, t); err != nil { - sharedNode.stop(t) - sharedNode = nil + if err := h.waitForHealth(ctx, t); err != nil { + h.stop(t) t.Fatalf("node failed to become healthy: %v", err) } + sharedNode = h } // nodePortAvailable returns true if the node's telemetry port (10000) is free.