diff --git a/yarn-project/archiver/src/archiver-sync.test.ts b/yarn-project/archiver/src/archiver-sync.test.ts index f75422e67168..acfa5c046c8d 100644 --- a/yarn-project/archiver/src/archiver-sync.test.ts +++ b/yarn-project/archiver/src/archiver-sync.test.ts @@ -72,7 +72,7 @@ describe('Archiver Sync', () => { // beforeEach default instance and by tests that need a second archiver with a different config. const buildArchiver = async ( storeName: string, - configOverrides: { skipOrphanProposedBlockPruning?: boolean } = {}, + configOverrides: { skipOrphanProposedBlockPruning?: boolean; batchSize?: number } = {}, ): Promise<{ archiver: Archiver; synchronizer: ArchiverL1Synchronizer; archiverStore: ArchiverDataStores }> => { const store = createArchiverDataStores(await openTmpStore(storeName), GENESIS_BLOCK_HEADER_HASH); @@ -920,6 +920,251 @@ describe('Archiver Sync', () => { expect(rejectedBad).toBeDefined(); expect(rejectedValid).toBeDefined(); }, 15_000); + + it('rejects a checkpoint with invalid attestations even when its blob data is malformed', async () => { + // Regression for A-1252: the archiver fetched and decoded checkpoint blobs before validating + // committee attestations. A checkpoint with BOTH invalid attestations and malformed blob data threw + // BlobDeserializationError during decode before the invalid-attestation skip path ran, so it was + // never recorded as rejected and sync looped on it forever (taking the valid CP1 in the same batch + // down with it). Attestations must be validated from calldata first, so the malformed blob is never + // fetched/decoded. + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(0)); + + // Committee of 3 signers. + fake.setTargetCommitteeSize(3); + const signers = times(3, Secp256k1Signer.random); + const committee = signers.map(signer => signer.address); + epochCache.getCommitteeForEpoch.mockResolvedValue({ committee } as EpochCommitteeInfo); + + const invalidCheckpointDetectedSpy = jest.fn(); + archiver.events.on(L2BlockSourceEvents.InvalidAttestationsCheckpointDetected, invalidCheckpointDetectedSpy); + + // Valid CP1 with correct attestations and well-formed blobs. + await fake.addCheckpoint(CheckpointNumber(1), { + l1BlockNumber: 70n, + messagesL1BlockNumber: 50n, + numL1ToL2Messages: 3, + signers, + }); + + // CP2 with BAD attestations (random signers not in committee). + const badSigners = times(3, Secp256k1Signer.random); + const { checkpoint: badCp2 } = await fake.addCheckpoint(CheckpointNumber(2), { + l1BlockNumber: 80n, + numL1ToL2Messages: 0, + signers: badSigners, + }); + + // Make ONLY CP2's blob malformed; CP1 keeps its real blobs. The default mock maps a blob sidecar to a + // checkpoint by its L1 block hash (Buffer32 of the L1 block number). + const cp2BlockId = Buffer32.fromBigInt(80n).toString(); + const malformedBlob = await makeRandomBlob(3); + const defaultGetBlobSidecar = blobClient.getBlobSidecar.getMockImplementation()!; + blobClient.getBlobSidecar.mockImplementation((...args: Parameters) => + args[0] === cp2BlockId ? Promise.resolve([malformedBlob]) : defaultGetBlobSidecar(...args), + ); + + fake.setL1BlockNumber(82n); + + // Must not throw: attestations are checked from calldata before the malformed CP2 blob is fetched. + await expect(archiver.syncImmediate()).resolves.toBeUndefined(); + + // CP1 syncs; CP2 is rejected for invalid attestations (not a blob-decode failure). + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(1)); + expect(invalidCheckpointDetectedSpy).toHaveBeenCalledWith( + expect.objectContaining({ + type: L2BlockSourceEvents.InvalidAttestationsCheckpointDetected, + validationResult: expect.objectContaining({ + valid: false, + checkpoint: expect.objectContaining({ checkpointNumber: 2 }), + }), + }), + ); + const rejected = await archiverStore.blocks.getRejectedCheckpointByArchiveRoot(badCp2.archive.root); + expect(rejected).toBeDefined(); + + // Repeated polling over the same L1 state stays stable. Without the fix, the malformed CP2 blob + // throws on every sync and the batch never commits -- even the valid CP1 stays unsynced and the + // archiver is stuck re-querying the same L1 blocks forever (the sync point never advances past the + // throw). With the fix, CP2 is rejected from calldata, CP1 is synced, and re-polling is a no-op. + for (let i = 0; i < 3; i++) { + await expect(archiver.syncImmediate()).resolves.toBeUndefined(); + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(1)); + } + }, 20_000); + + it('throws on a malformed blob with valid attestations while the epoch can still be proven', async () => { + // A-1252 rows 4/5 boundary: a checkpoint with VALID attestations but an unfetchable/undecodable blob + // must still be treated as fatal while its epoch can be proven (rollup cannot prune yet). The blob is + // canonical and must eventually become available, so we keep retrying rather than skipping it. + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(0)); + + fake.setTargetCommitteeSize(3); + const signers = times(3, Secp256k1Signer.random); + const committee = signers.map(signer => signer.address); + epochCache.getCommitteeForEpoch.mockResolvedValue({ committee } as EpochCommitteeInfo); + + // CP1 valid with well-formed blobs. + await fake.addCheckpoint(CheckpointNumber(1), { + l1BlockNumber: 70n, + messagesL1BlockNumber: 50n, + numL1ToL2Messages: 3, + signers, + }); + + // CP2 with VALID attestations (signed by the committee) but a malformed blob. + await fake.addCheckpoint(CheckpointNumber(2), { + l1BlockNumber: 80n, + numL1ToL2Messages: 0, + signers, + }); + + const cp2BlockId = Buffer32.fromBigInt(80n).toString(); + const malformedBlob = await makeRandomBlob(3); + const defaultGetBlobSidecar = blobClient.getBlobSidecar.getMockImplementation()!; + blobClient.getBlobSidecar.mockImplementation((...args: Parameters) => + args[0] === cp2BlockId ? Promise.resolve([malformedBlob]) : defaultGetBlobSidecar(...args), + ); + + // Rollup cannot prune yet: the epoch is still provable, so the malformed blob is fatal. + fake.setCanPrune(false); + fake.setL1BlockNumber(82n); + + await expect(archiver.syncImmediate()).rejects.toThrow(); + }, 20_000); + + it('skips a malformed-blob checkpoint and recovers once its epoch can be pruned', async () => { + // A-1252 rows 4/5: a bribed-committee checkpoint (valid attestations) whose blob is withheld would, + // before this fix, throw during blob decode on every iteration — freezing the L1 sync clock and + // halting every honest proposer so the prune that recovers the chain never fires. Once the proof + // window has expired (rollup can prune), the archiver must skip the unfetchable checkpoint, advance + // its clock, and let the epoch-prune machinery roll the pending chain back to the proven tip. + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(0)); + + fake.setTargetCommitteeSize(3); + const signers = times(3, Secp256k1Signer.random); + const committee = signers.map(signer => signer.address); + epochCache.getCommitteeForEpoch.mockResolvedValue({ committee } as EpochCommitteeInfo); + + // CP1 and CP2: valid attestations, well-formed blobs. Both sync normally. + const { checkpoint: cp1 } = await fake.addCheckpoint(CheckpointNumber(1), { + l1BlockNumber: 70n, + messagesL1BlockNumber: 50n, + numL1ToL2Messages: 3, + signers, + }); + await fake.addCheckpoint(CheckpointNumber(2), { + l1BlockNumber: 80n, + messagesL1BlockNumber: 60n, + numL1ToL2Messages: 3, + signers, + }); + + fake.setL1BlockNumber(90n); + await archiver.syncImmediate(); + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(2)); + const l1TimestampBefore = await archiver.getL1Timestamp(); + + // CP1 gets proven; CP2 sits in the unproven epoch. + fake.markCheckpointAsProven(CheckpointNumber(1)); + + // CP3 arrives with VALID attestations (committee-signed) but a withheld/undecodable blob. + await fake.addCheckpoint(CheckpointNumber(3), { + l1BlockNumber: 100n, + numL1ToL2Messages: 0, + signers, + }); + const cp3BlockId = Buffer32.fromBigInt(100n).toString(); + const malformedBlob = await makeRandomBlob(3); + const defaultGetBlobSidecar = blobClient.getBlobSidecar.getMockImplementation()!; + blobClient.getBlobSidecar.mockImplementation((...args: Parameters) => + args[0] === cp3BlockId ? Promise.resolve([malformedBlob]) : defaultGetBlobSidecar(...args), + ); + + const pruneSpy = jest.fn(); + archiver.events.on(L2BlockSourceEvents.L2PruneUnproven, pruneSpy); + + // Proof window expired: the rollup would prune on the next L1 block. + fake.setCanPrune(true); + fake.setL1BlockNumber(101n); + + // Must not throw: the unfetchable CP3 is in a prunable epoch, so it is skipped rather than fatal. + await expect(archiver.syncImmediate()).resolves.toBeUndefined(); + + // CP3 is never ingested, and the unproven CP2 is rolled back to the proven tip (CP1). + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(1)); + expect(await archiver.getProvenCheckpointNumber()).toEqual(CheckpointNumber(1)); + expect(await archiver.getCheckpoints({ from: CheckpointNumber(3), limit: 1 })).toEqual([]); + expect(pruneSpy).toHaveBeenCalled(); + + // The sync clock advanced rather than freezing on the blob error — honest proposers stay live. + const l1TimestampAfter = await archiver.getL1Timestamp(); + expect(l1TimestampAfter).toEqual(fake.getTimestampAtL1Block(101n)); + expect(l1TimestampAfter!).toBeGreaterThan(l1TimestampBefore!); + + // L2Tips reflect the rollback to checkpoint 1. + const tips = await archiver.getL2Tips(); + expect(tips.checkpointed.checkpoint.number).toEqual(CheckpointNumber(1)); + expect(tips.checkpointed.block.number).toEqual(cp1.blocks[cp1.blocks.length - 1].number); + + archiver.events.off(L2BlockSourceEvents.L2PruneUnproven, pruneSpy); + }, 20_000); + + it('does not ingest a later-batch checkpoint that builds on a skipped prunable one', async () => { + // Covers the loop-break (stopAfterBatch) in handleCheckpoints, distinct from the in-batch filter: + // once a prunable blob failure skips checkpoint N, every later checkpoint this iteration must be + // skipped too — including ones that land in a *later* L1-block batch. If such a checkpoint (valid + // attestations, fetchable blob, but building on the skipped N) were pulled in, addCheckpoints would + // throw InitialCheckpointNumberNotSequentialError on the gap and re-freeze sync. We use a tiny batch + // size so CP2 (bad blob) and CP3 (good blob, builds on CP2) fall in separate batches. + const { archiver: smallBatchArchiver } = await buildArchiver('archiver_small_batch', { batchSize: 1 }); + try { + fake.setTargetCommitteeSize(3); + const signers = times(3, Secp256k1Signer.random); + epochCache.getCommitteeForEpoch.mockResolvedValue({ + committee: signers.map(s => s.address), + } as EpochCommitteeInfo); + + // CP1: valid attestations + good blob. Synced and proven — the tip we expect to roll back to. + await fake.addCheckpoint(CheckpointNumber(1), { + l1BlockNumber: 2n, + messagesL1BlockNumber: 1n, + numL1ToL2Messages: 3, + signers, + }); + fake.setL1BlockNumber(3n); + await smallBatchArchiver.syncImmediate(); + expect(await smallBatchArchiver.getCheckpointNumber()).toEqual(CheckpointNumber(1)); + fake.markCheckpointAsProven(CheckpointNumber(1)); + + // CP2 (valid attestations, malformed blob) and CP3 (valid attestations, good blob, chains off CP2), + // spaced >2 L1 blocks apart so they land in separate batches given batchSize 1 (2 L1 blocks/batch). + await fake.addCheckpoint(CheckpointNumber(2), { l1BlockNumber: 5n, numL1ToL2Messages: 0, signers }); + await fake.addCheckpoint(CheckpointNumber(3), { l1BlockNumber: 8n, numL1ToL2Messages: 0, signers }); + const cp2BlockId = Buffer32.fromBigInt(5n).toString(); + const malformedBlob = await makeRandomBlob(3); + const defaultGetBlobSidecar = blobClient.getBlobSidecar.getMockImplementation()!; + blobClient.getBlobSidecar.mockImplementation((...args: Parameters) => + args[0] === cp2BlockId ? Promise.resolve([malformedBlob]) : defaultGetBlobSidecar(...args), + ); + + fake.setCanPrune(true); + fake.setL1BlockNumber(10n); + + // Must not throw: CP2 is skipped as prunable, and CP3 (in a later batch, building on CP2) must not + // be pulled in — otherwise its ingestion would hit the sequential-number gap and rethrow. + await expect(smallBatchArchiver.syncImmediate()).resolves.toBeUndefined(); + + // Neither CP2 nor CP3 ingested; the chain stayed at the proven tip CP1. + expect(await smallBatchArchiver.getCheckpointNumber()).toEqual(CheckpointNumber(1)); + expect(await smallBatchArchiver.getProvenCheckpointNumber()).toEqual(CheckpointNumber(1)); + expect(await smallBatchArchiver.getCheckpoints({ from: CheckpointNumber(2), limit: 2 })).toEqual([]); + + blobClient.getBlobSidecar.mockImplementation(defaultGetBlobSidecar); + } finally { + await smallBatchArchiver.stop(); + } + }, 20_000); }); describe('reorg handling', () => { @@ -1666,6 +1911,64 @@ describe('Archiver Sync', () => { expect(checkpointedBlocks[0].checkpointNumber).toEqual(2); }, 10_000); + it('promotes a matching local checkpoint even when its on-chain blob is malformed', async () => { + // A-1252 row 2: a checkpoint with a withheld/malformed blob is immune to the blob-decode stall when a + // matching local proposed copy exists, because it is promoted from local blocks and the blob fetch is + // skipped entirely. This must hold regardless of the blob being unfetchable. + await fake.addCheckpoint(CheckpointNumber(1), { + l1BlockNumber: 70n, + messagesL1BlockNumber: 60n, + numL1ToL2Messages: 3, + }); + + fake.setL1BlockNumber(100n); + await archiver.syncImmediate(); + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(1)); + + // Checkpoint 2 on L1 at a far-future block, with a malformed blob that would throw if ever fetched. + const { checkpoint: cp2 } = await fake.addCheckpoint(CheckpointNumber(2), { + l1BlockNumber: 5000n, + messagesL1BlockNumber: 4990n, + numL1ToL2Messages: 3, + }); + const cp2BlockId = Buffer32.fromBigInt(5000n).toString(); + const malformedBlob = await makeRandomBlob(3); + const defaultGetBlobSidecar = blobClient.getBlobSidecar.getMockImplementation()!; + blobClient.getBlobSidecar.mockImplementation((...args: Parameters) => + args[0] === cp2BlockId ? Promise.resolve([malformedBlob]) : defaultGetBlobSidecar(...args), + ); + + // Register checkpoint 2's blocks and a matching proposed checkpoint directly on the store, so the + // archiver has a local copy to promote (the archive root is computed from the stored blocks). We go + // through the store rather than archiver.addBlock/addProposedCheckpoint to avoid those methods firing + // background sync runs that would race with the explicit sync below. + for (const block of cp2.blocks) { + await archiverStore.blocks.addProposedBlock(block); + } + await archiverStore.blocks.addProposedCheckpoint({ + checkpointNumber: CheckpointNumber(2), + header: cp2.header, + startBlock: cp2.blocks[0].number, + blockCount: cp2.blocks.length, + totalManaUsed: 0n, + feeAssetPriceModifier: cp2.feeAssetPriceModifier, + }); + + blobClient.getBlobSidecar.mockClear(); + + fake.setL1BlockNumber(5010n); + await expect(archiver.syncImmediate()).resolves.toBeUndefined(); + + // Checkpoint 2 is ingested via promotion; its malformed blob was never fetched. + expect(await archiver.getCheckpointNumber()).toEqual(CheckpointNumber(2)); + expect(pruneSpy).not.toHaveBeenCalled(); + expect(blobClient.getBlobSidecar).not.toHaveBeenCalledWith(cp2BlockId, expect.anything(), expect.anything()); + + const tips = await archiver.getL2Tips(); + expect(tips.checkpointed.checkpoint.number).toEqual(CheckpointNumber(2)); + expect(tips.checkpointed.block.number).toEqual(cp2.blocks[cp2.blocks.length - 1].number); + }, 10_000); + it('rejects adding blocks that are already checkpointed', async () => { // First, sync checkpoint 1 from L1 to establish a baseline const { checkpoint: cp1 } = await fake.addCheckpoint(CheckpointNumber(1), { diff --git a/yarn-project/archiver/src/modules/l1_synchronizer.ts b/yarn-project/archiver/src/modules/l1_synchronizer.ts index fd61b33a963b..227b8d705a59 100644 --- a/yarn-project/archiver/src/modules/l1_synchronizer.ts +++ b/yarn-project/archiver/src/modules/l1_synchronizer.ts @@ -1,4 +1,5 @@ import type { BlobClientInterface } from '@aztec/blob-client/client'; +import { BlobDeserializationError } from '@aztec/blob-lib'; import { EpochCache } from '@aztec/epoch-cache'; import { InboxContract, type InboxContractState, RollupContract } from '@aztec/ethereum/contracts'; import type { L1BlockId } from '@aztec/ethereum/l1-types'; @@ -17,13 +18,19 @@ import { count } from '@aztec/foundation/string'; import { DateProvider, Timer, elapsed } from '@aztec/foundation/timer'; import { isDefined, isErrorClass } from '@aztec/foundation/types'; import { type ArchiverEmitter, L2BlockSourceEvents, type ValidateCheckpointResult } from '@aztec/stdlib/block'; -import { Checkpoint, type CheckpointData, PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; +import { + Checkpoint, + type CheckpointData, + type CheckpointInfo, + type L1PublishedData, + PublishedCheckpoint, +} from '@aztec/stdlib/checkpoint'; import { type L1RollupConstants, getEpochAtSlot, getSlotAtNextL1Block } from '@aztec/stdlib/epoch-helpers'; import { computeInHashFromL1ToL2Messages } from '@aztec/stdlib/messaging'; import type { CoordinationSignatureContext } from '@aztec/stdlib/p2p'; import { type Traceable, type Tracer, execInSpan, trackSpan } from '@aztec/telemetry-client'; -import { InitialCheckpointNumberNotSequentialError } from '../errors.js'; +import { InitialCheckpointNumberNotSequentialError, NoBlobBodiesFoundError } from '../errors.js'; import { type RetrievedCheckpointFromCalldata, getCheckpointBlobDataFromBlobs, @@ -39,7 +46,7 @@ import { MessageStoreError } from '../store/message_store.js'; import type { InboxMessage } from '../structs/inbox_message.js'; import { ArchiverDataStoreUpdater } from './data_store_updater.js'; import type { ArchiverInstrumentation } from './instrumentation.js'; -import { validateCheckpointAttestations } from './validation.js'; +import { validateCheckpointAttestationsFromCalldata } from './validation.js'; type RollupStatus = { provenCheckpointNumber: CheckpointNumber; @@ -50,9 +57,18 @@ type RollupStatus = { /** Last valid checkpoint observed on L1 and synced on this iteration */ lastRetrievedCheckpoint?: PublishedCheckpoint; /** Last checkpoint observed on L1 across both valid and rejected entries on this iteration */ - lastSeenCheckpoint?: PublishedCheckpoint; + lastSeenCheckpoint?: { checkpointNumber: CheckpointNumber; l1: L1PublishedData }; }; +/** + * Outcome of fetching and building a single checkpoint from its blobs. A blob that is missing or + * undecodable yields a `blobError` sentinel instead of rejecting the whole fetch pool, so the caller + * can decide in checkpoint order whether the failure is fatal. See A-1252 (rows 4/5). + */ +type BlobFetchOutcome = + | { checkpoint: RetrievedCheckpointFromCalldata; published: PublishedCheckpoint } + | { checkpoint: RetrievedCheckpointFromCalldata; blobError: NoBlobBodiesFoundError | BlobDeserializationError }; + /** * Handles L1 synchronization for the archiver. * Responsible for fetching checkpoints, L1 to L2 messages, and handling L1 reorgs. @@ -189,7 +205,12 @@ export class ArchiverL1Synchronizer implements Traceable { if (currentL1BlockNumber > blocksSynchedTo) { // First we retrieve new checkpoints and L2 blocks and store them in the DB. This will also update the // pending chain validation status, proven checkpoint number, and synched L1 block number. - const rollupStatus = await this.handleCheckpoints(blocksSynchedTo, currentL1BlockNumber, initialSyncComplete); + const rollupStatus = await this.handleCheckpoints( + blocksSynchedTo, + currentL1BlockNumber, + currentL1Timestamp, + initialSyncComplete, + ); // Then we try pruning uncheckpointed blocks if a new slot was mined without checkpoints await this.pruneUncheckpointedBlocks(currentL1Timestamp); @@ -614,6 +635,7 @@ export class ArchiverL1Synchronizer implements Traceable { private async handleCheckpoints( blocksSynchedTo: bigint, currentL1BlockNumber: bigint, + currentL1Timestamp: bigint, initialSyncComplete: boolean, ): Promise { const localPendingCheckpointNumber = await this.stores.blocks.getLatestCheckpointNumber(); @@ -790,7 +812,7 @@ export class ArchiverL1Synchronizer implements Traceable { let searchStartBlock: bigint = blocksSynchedTo; let searchEndBlock: bigint = blocksSynchedTo; let lastRetrievedCheckpoint: PublishedCheckpoint | undefined; - let lastSeenCheckpoint: PublishedCheckpoint | undefined; + let lastSeenCheckpoint: { checkpointNumber: CheckpointNumber; l1: L1PublishedData } | undefined; do { [searchStartBlock, searchEndBlock] = this.nextRange(searchEndBlock, currentL1BlockNumber); @@ -835,37 +857,21 @@ export class ArchiverL1Synchronizer implements Traceable { const evictProposedFrom = promoteResult && 'diverged' in promoteResult ? promoteResult.fromCheckpointNumber : undefined; - // Then fetch blobs in parallel and build the full published checkpoints - const toFetchBlobs = checkpointToPromote ? calldataCheckpoints.slice(0, -1) : calldataCheckpoints; - const blobFetched = await asyncPool(10, toFetchBlobs, async checkpoint => - retrievedToPublishedCheckpoint({ - ...checkpoint, - checkpointBlobData: await getCheckpointBlobDataFromBlobs( - this.blobClient, - checkpoint.l1.blockHash, - checkpoint.blobHashes, - checkpoint.checkpointNumber, - this.log, - !initialSyncComplete, - checkpoint.parentBeaconBlockRoot, - checkpoint.l1.timestamp, - ), - }), - ); + // Validate attestations from CALLDATA before fetching any blobs. A checkpoint with invalid + // attestations (or one descending from a rejected ancestor) is rejected here without fetching its + // blobs, so a malformed blob does not throw during decode before the rejection path runs and + // stall sync. The signed consensus payload (header, archive root, fee asset price + // modifier) is fully available from calldata. + let checkpointsToIngest: RetrievedCheckpointFromCalldata[] = []; - // And add the promoted checkpoint to the list of all checkpoints - const publishedCheckpoints = checkpointToPromote ? [...blobFetched, checkpointToPromote] : blobFetched; - const validCheckpoints: PublishedCheckpoint[] = []; - - // Now loop through all checkpoints and validate their attestations - for (const published of publishedCheckpoints) { - // Check the attestations uploaded by the publisher to L1 are correct + for (const calldataCheckpoint of calldataCheckpoints) { + // Check the attestations uploaded by the publisher to L1 are correct. // Rollup contract does not validate attestations to save on gas, so this // falls on the nodes to verify offchain and skip those checkpoints. const validationResult = this.config.skipValidateCheckpointAttestations ? { valid: true as const } - : await validateCheckpointAttestations( - published, + : await validateCheckpointAttestationsFromCalldata( + calldataCheckpoint, this.epochCache, this.l1Constants, this.getSignatureContext(), @@ -877,7 +883,7 @@ export class ArchiverL1Synchronizer implements Traceable { // ancestor was skipped earlier (e.g. due to invalid attestations), the catch handler // would roll back the L1 sync point, and the next iteration would re-fetch and re-throw. const rejectedAncestor = await this.stores.blocks.getRejectedCheckpointByArchiveRoot( - published.checkpoint.header.lastArchiveRoot, + calldataCheckpoint.header.lastArchiveRoot, ); // Update the validation result if it has changed, so we can keep track of the first invalid checkpoint @@ -899,9 +905,9 @@ export class ArchiverL1Synchronizer implements Traceable { } if (!validationResult.valid) { - this.log.warn(`Skipping checkpoint ${published.checkpoint.number} due to invalid attestations`, { - checkpointHash: published.checkpoint.hash(), - l1BlockNumber: published.l1.blockNumber, + this.log.warn(`Skipping checkpoint ${calldataCheckpoint.checkpointNumber} due to invalid attestations`, { + checkpointNumber: calldataCheckpoint.checkpointNumber, + l1BlockNumber: calldataCheckpoint.l1.blockNumber, ...pick(validationResult, 'reason'), }); @@ -915,11 +921,11 @@ export class ArchiverL1Synchronizer implements Traceable { // is detected and skipped (rather than tripping the addCheckpoints consecutive-number // check and causing the sync point to roll back in a loop). await this.stores.blocks.addRejectedCheckpoint({ - checkpointNumber: published.checkpoint.number, - archiveRoot: published.checkpoint.archive.root, - parentArchiveRoot: published.checkpoint.header.lastArchiveRoot, - slotNumber: published.checkpoint.header.slotNumber, - l1: published.l1, + checkpointNumber: calldataCheckpoint.checkpointNumber, + archiveRoot: calldataCheckpoint.archiveRoot, + parentArchiveRoot: calldataCheckpoint.header.lastArchiveRoot, + slotNumber: calldataCheckpoint.header.slotNumber, + l1: calldataCheckpoint.l1, reason: 'invalid-attestations' as const, }); @@ -927,15 +933,20 @@ export class ArchiverL1Synchronizer implements Traceable { } if (rejectedAncestor) { - const descendantInfo = published.checkpoint.toCheckpointInfo(); + const descendantInfo: CheckpointInfo = { + archive: calldataCheckpoint.archiveRoot, + lastArchive: calldataCheckpoint.header.lastArchiveRoot, + slotNumber: calldataCheckpoint.header.slotNumber, + checkpointNumber: calldataCheckpoint.checkpointNumber, + timestamp: calldataCheckpoint.header.timestamp, + }; this.log.warn( - `Skipping checkpoint ${published.checkpoint.number} as it is a descendant of ` + + `Skipping checkpoint ${calldataCheckpoint.checkpointNumber} as it is a descendant of ` + `rejected checkpoint ${rejectedAncestor.checkpointNumber} (${rejectedAncestor.reason})`, { - checkpointNumber: published.checkpoint.number, - checkpointHash: published.checkpoint.hash(), - l1BlockNumber: published.l1.blockNumber, - l1BlockHash: published.l1.blockHash, + checkpointNumber: calldataCheckpoint.checkpointNumber, + l1BlockNumber: calldataCheckpoint.l1.blockNumber, + l1BlockHash: calldataCheckpoint.l1.blockHash, ancestorCheckpointNumber: rejectedAncestor.checkpointNumber, ancestorArchiveRoot: rejectedAncestor.archiveRoot.toString(), ancestorReason: rejectedAncestor.reason, @@ -952,17 +963,102 @@ export class ArchiverL1Synchronizer implements Traceable { // Persist this chainpoint as rejected as well, so we can construct a chain of // skipped checkpoints starting from the first one with invalid attestations. await this.stores.blocks.addRejectedCheckpoint({ - checkpointNumber: published.checkpoint.number, - archiveRoot: published.checkpoint.archive.root, - parentArchiveRoot: published.checkpoint.header.lastArchiveRoot, - slotNumber: published.checkpoint.header.slotNumber, - l1: published.l1, + checkpointNumber: calldataCheckpoint.checkpointNumber, + archiveRoot: calldataCheckpoint.archiveRoot, + parentArchiveRoot: calldataCheckpoint.header.lastArchiveRoot, + slotNumber: calldataCheckpoint.header.slotNumber, + l1: calldataCheckpoint.l1, reason: 'descends-from-invalid-attestations' as const, }); continue; } + checkpointsToIngest.push(calldataCheckpoint); + } + + // Fetch blobs in parallel only for the surviving (attestation-valid, non-descendant) checkpoints, + // then build the full published checkpoints. The last calldata checkpoint may be promotable from a + // local proposed block (checkpointToPromote), in which case it carries no blob to fetch. + const toFetchBlobs = checkpointToPromote + ? checkpointsToIngest.filter(c => c.checkpointNumber !== checkpointToPromote.checkpoint.number) + : checkpointsToIngest; + + // Fetch blobs concurrently, but tolerate per-checkpoint blob failures instead of rejecting the whole + // pool: a missing/undecodable blob yields a sentinel so we can decide, in checkpoint order, whether it + // is fatal. + const blobResults = await asyncPool(10, toFetchBlobs, async (checkpoint): Promise => { + try { + const checkpointBlobData = await getCheckpointBlobDataFromBlobs( + this.blobClient, + checkpoint.l1.blockHash, + checkpoint.blobHashes, + checkpoint.checkpointNumber, + this.log, + !initialSyncComplete, + checkpoint.parentBeaconBlockRoot, + checkpoint.l1.timestamp, + ); + return { checkpoint, published: await retrievedToPublishedCheckpoint({ ...checkpoint, checkpointBlobData }) }; + } catch (err) { + if (err instanceof NoBlobBodiesFoundError || err instanceof BlobDeserializationError) { + return { checkpoint, blobError: err }; + } + throw err; + } + }); + + // A blob fetch/decode failure is only fatal while the checkpoint's epoch can still be proven. Once the + // proof-submission window has expired (the rollup can prune on the next L1 block), the checkpoint is + // destined for pruning, so we stop treating it as fatal: we skip it (and every later checkpoint) and + // let the epoch-prune recovery proceed. + const firstBlobFailure = blobResults + .flatMap(r => ('blobError' in r ? [r] : [])) + .sort((a, b) => a.checkpoint.checkpointNumber - b.checkpoint.checkpointNumber)[0]; + + let stopAfterBatch = false; + if (firstBlobFailure) { + const failedNumber = firstBlobFailure.checkpoint.checkpointNumber; + if (!(await this.canPrune(currentL1BlockNumber, currentL1Timestamp))) { + // The checkpoint is canonical and may still be proven, so the blob must eventually become + // available. Rethrow to retry on the next iteration. + this.log.error( + `Failed to fetch blob for checkpoint ${failedNumber} whose epoch can still be proven; will retry`, + { checkpointNumber: failedNumber, l1BlockNumber: firstBlobFailure.checkpoint.l1.blockNumber }, + ); + throw firstBlobFailure.blobError; + } + + this.log.warn( + `Skipping checkpoint ${failedNumber} and any later checkpoints this iteration due to an unfetchable ` + + `blob in a prunable epoch; deferring to epoch-prune recovery`, + { + checkpointNumber: failedNumber, + l1BlockNumber: firstBlobFailure.checkpoint.l1.blockNumber, + reason: firstBlobFailure.blobError.message, + }, + ); + + // Drop the failed checkpoint and every later one in this batch from ingestion (they belong to the + // doomed unproven epoch), and stop the batch loop so we do not pull in still-later checkpoints. + checkpointsToIngest = checkpointsToIngest.filter(c => c.checkpointNumber < failedNumber); + stopAfterBatch = true; + } + + // Index the successfully-built checkpoints by number so we can ingest them in calldata order, slotting + // in the promoted checkpoint (built from a local proposed block rather than blobs). + const publishedCheckpoints = blobResults.flatMap(r => ('published' in r ? [r.published] : [])); + const publishedByNumber = new Map( + publishedCheckpoints.map(published => [published.checkpoint.number, published]), + ); + if (checkpointToPromote) { + publishedByNumber.set(checkpointToPromote.checkpoint.number, checkpointToPromote); + } + + const validCheckpoints: PublishedCheckpoint[] = []; + for (const calldataCheckpoint of checkpointsToIngest) { + const published = publishedByNumber.get(calldataCheckpoint.checkpointNumber)!; + // Check the inHash of the checkpoint against the l1->l2 messages. // The messages should've been synced up to the currentL1BlockNumber and must be available for the published // checkpoints we just retrieved. @@ -1095,7 +1191,17 @@ export class ArchiverL1Synchronizer implements Traceable { }); } lastRetrievedCheckpoint = validCheckpoints.at(-1) ?? lastRetrievedCheckpoint; - lastSeenCheckpoint = publishedCheckpoints.at(-1) ?? lastSeenCheckpoint; + // The last checkpoint seen on L1 this batch (valid or rejected), tracked from calldata since + // rejected checkpoints are no longer built into PublishedCheckpoints. + lastSeenCheckpoint = { + checkpointNumber: lastCalldataCheckpoint.checkpointNumber, + l1: lastCalldataCheckpoint.l1, + }; + + // A prunable blob failure means every later checkpoint is in the doomed epoch; stop fetching them. + if (stopAfterBatch) { + break; + } } while (searchEndBlock < currentL1BlockNumber); // Important that we update AFTER inserting the blocks. @@ -1205,7 +1311,7 @@ export class ArchiverL1Synchronizer implements Traceable { // Compare the last checkpoint (valid or not) we have (either retrieved in this round or loaded from store) // with what the rollup contract told us was the latest one (pinned at the currentL1BlockNumber). const latestLocalCheckpointNumber = - lastSeenCheckpoint?.checkpoint.number ?? + lastSeenCheckpoint?.checkpointNumber ?? CheckpointNumber.max( await this.stores.blocks.getLatestCheckpointNumber(), await this.stores.blocks.getLatestRejectedCheckpointNumber(), @@ -1218,7 +1324,11 @@ export class ArchiverL1Synchronizer implements Traceable { // We suspect an L1 reorg that added checkpoints *behind* us. If that is the case, it must have happened between // the last checkpoint we saw and the current one, so we reset the last synched L1 block number. In the edge case // we don't have one, we go back 2 L1 epochs, which is the deepest possible reorg (assuming Casper is working). - const latestLocalCheckpoint: PublishedCheckpoint | CheckpointData | RejectedCheckpoint | undefined = + const latestLocalCheckpoint: + | { checkpointNumber: CheckpointNumber; l1: L1PublishedData } + | CheckpointData + | RejectedCheckpoint + | undefined = lastSeenCheckpoint ?? (await this.stores.blocks.getCheckpointData(latestLocalCheckpointNumber)) ?? (await this.stores.blocks.getRejectedCheckpointByNumber(latestLocalCheckpointNumber)); diff --git a/yarn-project/archiver/src/modules/validation.ts b/yarn-project/archiver/src/modules/validation.ts index ffb6c0f57c7f..e53d70ece19f 100644 --- a/yarn-project/archiver/src/modules/validation.ts +++ b/yarn-project/archiver/src/modules/validation.ts @@ -1,16 +1,19 @@ import type { EpochCache } from '@aztec/epoch-cache'; -import { EpochNumber } from '@aztec/foundation/branded-types'; +import { type CheckpointNumber, EpochNumber } from '@aztec/foundation/branded-types'; import { compactArray } from '@aztec/foundation/collection'; +import type { Fr } from '@aztec/foundation/curves/bn254'; import type { Logger } from '@aztec/foundation/log'; import { type AttestationInfo, + type CommitteeAttestation, type ValidateCheckpointNegativeResult, type ValidateCheckpointResult, getAttestationInfoFromPayload, } from '@aztec/stdlib/block'; -import type { PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; +import type { CheckpointInfo, PublishedCheckpoint } from '@aztec/stdlib/checkpoint'; import { type L1RollupConstants, computeQuorum, getEpochAtSlot } from '@aztec/stdlib/epoch-helpers'; import { ConsensusPayload, type CoordinationSignatureContext } from '@aztec/stdlib/p2p'; +import type { CheckpointHeader } from '@aztec/stdlib/rollup'; export type { ValidateCheckpointResult }; @@ -27,27 +30,83 @@ export function getAttestationInfoFromPublishedCheckpoint( } /** - * Validates the attestations submitted for the given checkpoint. + * Validates the attestations of a checkpoint already retrieved (with its blocks) from blobs. * Returns true if the attestations are valid and sufficient, false otherwise. */ -export async function validateCheckpointAttestations( +export function validateCheckpointAttestations( publishedCheckpoint: PublishedCheckpoint, epochCache: EpochCache, constants: Pick, signatureContext: CoordinationSignatureContext, logger?: Logger, ): Promise { - const attestorInfos = getAttestationInfoFromPublishedCheckpoint(publishedCheckpoint, signatureContext); - const attestors = compactArray(attestorInfos.map(info => ('address' in info ? info.address : undefined))); const { checkpoint, attestations } = publishedCheckpoint; - const headerHash = checkpoint.header.hash(); - const archiveRoot = checkpoint.archive.root.toString(); - const slot = checkpoint.header.slotNumber; + const payload = ConsensusPayload.fromCheckpoint(checkpoint, signatureContext); + return validateAttestations(payload, attestations, checkpoint.toCheckpointInfo(), epochCache, constants, logger); +} + +/** The subset of a calldata-only checkpoint needed to validate its committee attestations. */ +export type CalldataCheckpointForAttestations = { + checkpointNumber: CheckpointNumber; + archiveRoot: Fr; + feeAssetPriceModifier: bigint; + header: CheckpointHeader; + attestations: CommitteeAttestation[]; +}; + +/** + * Validates the attestations of a checkpoint from L1 calldata only, without fetching or decoding its blobs. + * The signed consensus payload (header, archive root, fee asset price modifier) is fully available from + * calldata, so an invalid-attestation checkpoint can be rejected before any (possibly malformed) blob is + * fetched and decoded. See A-1252. + */ +export function validateCheckpointAttestationsFromCalldata( + checkpoint: CalldataCheckpointForAttestations, + epochCache: EpochCache, + constants: Pick, + signatureContext: CoordinationSignatureContext, + logger?: Logger, +): Promise { + const payload = new ConsensusPayload( + checkpoint.header, + checkpoint.archiveRoot, + checkpoint.feeAssetPriceModifier, + signatureContext, + ); + const checkpointInfo: CheckpointInfo = { + archive: checkpoint.archiveRoot, + lastArchive: checkpoint.header.lastArchiveRoot, + slotNumber: checkpoint.header.slotNumber, + checkpointNumber: checkpoint.checkpointNumber, + timestamp: checkpoint.header.timestamp, + }; + return validateAttestations(payload, checkpoint.attestations, checkpointInfo, epochCache, constants, logger); +} + +/** + * Core attestation validation over a consensus payload, its attestations, and checkpoint metadata -- + * independent of whether the checkpoint's blocks have been decoded from blobs. Returns true if the + * attestations are valid and sufficient, false otherwise. + */ +async function validateAttestations( + payload: ConsensusPayload, + attestations: CommitteeAttestation[], + checkpointInfo: CheckpointInfo, + epochCache: EpochCache, + constants: Pick, + logger?: Logger, +): Promise { + const attestorInfos = getAttestationInfoFromPayload(payload, attestations); + const attestors = compactArray(attestorInfos.map(info => ('address' in info ? info.address : undefined))); + const headerHash = payload.header.hash(); + const archiveRoot = payload.archive.toString(); + const slot = payload.header.slotNumber; + const checkpointNumber = checkpointInfo.checkpointNumber; const epoch: EpochNumber = getEpochAtSlot(slot, constants); const { committee, seed } = await epochCache.getCommitteeForEpoch(epoch); - const logData = { checkpointNumber: checkpoint.number, slot, epoch, headerHash, archiveRoot }; + const logData = { checkpointNumber, slot, epoch, headerHash, archiveRoot }; - logger?.debug(`Validating attestations for checkpoint ${checkpoint.number} at slot ${slot} in epoch ${epoch}`, { + logger?.debug(`Validating attestations for checkpoint ${checkpointNumber} at slot ${slot} in epoch ${epoch}`, { committee: (committee ?? []).map(member => member.toString()), recoveredAttestors: attestorInfos, postedAttestations: attestations.map(a => (a.address.isZero() ? a.signature : a.address).toString()), @@ -72,7 +131,7 @@ export async function validateCheckpointAttestations( const failedValidationResult = (reason: TReason) => ({ valid: false as const, reason, - checkpoint: checkpoint.toCheckpointInfo(), + checkpoint: checkpointInfo, committee, seed, epoch, @@ -123,7 +182,7 @@ export async function validateCheckpointAttestations( } logger?.debug( - `Checkpoint attestations validated successfully for checkpoint ${checkpoint.number} at slot ${slot}`, + `Checkpoint attestations validated successfully for checkpoint ${checkpointNumber} at slot ${slot}`, logData, ); return { valid: true }; diff --git a/yarn-project/end-to-end/src/e2e_epochs/epochs_invalidate_block.parallel.test.ts b/yarn-project/end-to-end/src/e2e_epochs/epochs_invalidate_block.parallel.test.ts index 939ecc2fe1aa..fc7e64dc2a60 100644 --- a/yarn-project/end-to-end/src/e2e_epochs/epochs_invalidate_block.parallel.test.ts +++ b/yarn-project/end-to-end/src/e2e_epochs/epochs_invalidate_block.parallel.test.ts @@ -17,6 +17,7 @@ import { EthAddress } from '@aztec/foundation/eth-address'; import { createLogger } from '@aztec/foundation/log'; import { promiseWithResolvers } from '@aztec/foundation/promise'; import { retryUntil } from '@aztec/foundation/retry'; +import { sleep } from '@aztec/foundation/sleep'; import { bufferToHex } from '@aztec/foundation/string'; import { executeTimeout, timeoutPromise } from '@aztec/foundation/timer'; import type { TestContract } from '@aztec/noir-test-contracts.js/Test'; @@ -25,6 +26,8 @@ import { L2BlockSourceEvents } from '@aztec/stdlib/block'; import { computeQuorum, getTimestampForSlot } from '@aztec/stdlib/epoch-helpers'; import { jest } from '@jest/globals'; +import { readdir, rm } from 'node:fs/promises'; +import { join } from 'node:path'; import type { Log } from 'viem'; import { privateKeyToAccount } from 'viem/accounts'; @@ -878,3 +881,280 @@ describe('e2e_epochs/epochs_invalidate_block', () => { }); }); }); + +// Handling of unavailable or malformed blobs. Injecting malformed blobs is difficult to +// achieve so we use the absence of blobs to achieve the same thing, it has the same failure mode. +// Whether the proposals carry valid or invalid attestations should not matter, +// the node should correctly handle blobs being unavailable +describe('e2e_epochs/epochs_blob_unavailable_prune', () => { + let context: EndToEndContext; + let logger: Logger; + let l1Client: ExtendedViemWalletClient; + let rollupContract: RollupContract; + let portOffset = 100; + + let test: EpochsTestContext; + let validators: (Operator & { privateKey: `0x${string}` })[]; + let nodes: AztecNodeService[]; + let testContract: TestContract; + let from: AztecAddress; + let nullifierSeed = 0; + + beforeEach(async () => { + validators = times(VALIDATOR_COUNT, i => { + const privateKey = bufferToHex(getPrivateKeyFromIndex(i + 3)!); + const attester = EthAddress.fromString(privateKeyToAccount(privateKey).address); + return { attester, withdrawer: attester, privateKey, bn254SecretKey: new SecretValue(Fr.random().toBigInt()) }; + }); + + test = await EpochsTestContext.setup({ + ethereumSlotDuration: 8, + aztecSlotDuration: 32, + aztecEpochDuration: 6, + blockDurationMs: 6000, + numberOfAccounts: 0, + initialValidators: validators, + mockGossipSubNetwork: true, + // Short proof window + no prover, so a checkpoint's epoch becomes prunable shortly after it ends. + aztecProofSubmissionEpochs: 1, + startProverNode: false, + aztecTargetCommitteeSize: VALIDATOR_COUNT, + // Disable all invalidation by default so a bad checkpoint stays canonical while we test it; the + // invalid-attestations test re-enables proposer invalidation only for its recovery phase. + secondsBeforeInvalidatingBlockAsCommitteeMember: Number.MAX_SAFE_INTEGER, + secondsBeforeInvalidatingBlockAsNonCommitteeMember: Number.MAX_SAFE_INTEGER, + skipInvalidateBlockAsProposer: true, + archiverPollingIntervalMS: 200, + anvilAccounts: 20, + anvilPort: BASE_ANVIL_PORT + ++portOffset, + // Require a tx to build a checkpoint and never build empty ones, so checkpoint production is driven + // deterministically by sending txs — we control exactly which checkpoint is "bad" and the chain stays + // frozen on it once we stop. This avoids racing sequencer-config changes against streaming empties. + minTxsPerBlock: 1, + maxTxsPerBlock: 1, + buildCheckpointIfEmpty: false, + skipInitialSequencer: true, + }); + + ({ context, logger, l1Client } = test); + rollupContract = new RollupContract(l1Client, test.rollup.address); + from = context.accounts[0]; // auto-created by setup + nullifierSeed = 0; + + const validatorNodes = validators.slice(0, NODE_COUNT); + nodes = await asyncMap(validatorNodes, ({ privateKey }) => + test.createValidatorNode([privateKey], { dontStartSequencer: true, minTxsPerBlock: 1, maxTxsPerBlock: 1 }), + ); + testContract = await test.registerTestContract(context.wallet); + logger.warn(`Started ${NODE_COUNT} validator nodes.`); + }); + + afterEach(async () => { + jest.restoreAllMocks(); + await test.teardown(); + }); + + // Feed one tx and wait for exactly one new checkpoint to land. With empty checkpoints disabled and + // maxTxsPerBlock 1, a single tx produces a single checkpoint, then production halts (chain frozen). + const produceCheckpoint = async () => { + const start = (await nodes[0].getChainTips()).checkpointed.checkpoint.number; + void testContract.methods + .emit_nullifier(BigInt(++nullifierSeed)) + .send({ from, wait: NO_WAIT }) + .catch(() => {}); + await test.waitUntilCheckpointNumber(CheckpointNumber(start + 1), test.L2_SLOT_DURATION_IN_S * 10); + }; + + // Keep feeding txs until nodes[0] reaches `target` (used to resume production and rebuild after a prune). + const driveToCheckpoint = (target: CheckpointNumber, timeoutSlots = 16) => + retryUntil( + async () => { + const tip = (await nodes[0].getChainTips()).checkpointed.checkpoint.number; + if (tip >= target) { + return true; + } + void testContract.methods + .emit_nullifier(BigInt(++nullifierSeed)) + .send({ from, wait: NO_WAIT }) + .catch(() => {}); + return false; + }, + `drive chain to checkpoint ${target}`, + test.L2_SLOT_DURATION_IN_S * timeoutSlots, + 2, + ); + + it('rejects a canonical invalid-attestations checkpoint from calldata without its blob', async () => { + const sequencers = nodes.map(node => node.getSequencer()!); + + // Make every proposer skip collecting attestations BEFORE they start, so the first checkpoint they + // produce deterministically lands with insufficient attestations (setting it just before a specific + // checkpoint is racy under pipelining — the proposer may have already locked the prior config). We + // then feed exactly one tx to produce that one bad checkpoint and stop, so it stays the canonical tip + // (invalidation is disabled fixture-wide). Keeping it canonical is the whole point — it forces the + // observer down the attestation-from-calldata path rather than the archive-mismatch filter, which only + // drops checkpoints that are no longer canonical (e.g. already invalidated/replaced). + // Disable every invalidation path on the sequencers too (the fixture-level settings do not all + // propagate to the running sequencer), otherwise proposers thrash — invalidating and re-proposing the + // bad checkpoint every slot — instead of leaving it canonical. + sequencers.forEach(s => + s.updateConfig({ + skipCollectingAttestations: true, + skipInvalidateBlockAsProposer: true, + secondsBeforeInvalidatingBlockAsCommitteeMember: Number.MAX_SAFE_INTEGER, + secondsBeforeInvalidatingBlockAsNonCommitteeMember: Number.MAX_SAFE_INTEGER, + }), + ); + // Create a point with invalid attestations + await Promise.all(sequencers.map(s => s.start())); + await produceCheckpoint(); + + const proposedEvents = await rollupContract.getCheckpointProposedEvents(1n, await l1Client.getBlockNumber()); + const badEvent = proposedEvents.reduce((a, b) => (b.args.checkpointNumber > a.args.checkpointNumber ? b : a)); + const badCheckpointNumber = badEvent.args.checkpointNumber; + const badL1Timestamp = (await l1Client.getBlock({ blockNumber: badEvent.l1BlockNumber })).timestamp; + logger.warn(`Froze chain on invalid-attestations checkpoint ${badCheckpointNumber}`); + + // Withhold its blob from the shared store. + const sharedRoot = join(test.context.config.dataDirectory!, 'shared-blobs'); + const namespaceDir = (await readdir(sharedRoot)).find(e => e.startsWith('aztec-')); + expect(namespaceDir).toBeDefined(); + const blobsDir = join(sharedRoot, namespaceDir!, 'blobs'); + const targetNames = badEvent.args.versionedBlobHashes.map(h => `0x${h.toString('hex')}.data`); + const before = await readdir(blobsDir); + for (const name of targetNames) { + expect(before).toContain(name); + await rm(join(blobsDir, name), { force: true }); + } + logger.warn(`Withheld ${targetNames.length} blob(s) for checkpoint ${badCheckpointNumber}`); + + // Late observer (started after the bad checkpoint was gossiped, so it has no proposed copy to promote) + // that also never promotes, forcing it to rely on L1. It must rely on the attestation check, not the + // archive-mismatch filter: that filter only drops checkpoints that are no longer canonical, and we have + // kept this one canonical (no invalidation). Its sync clock advancing past the checkpoint — while its + // blob is withheld — proves it rejected from calldata without fetching the blob; without the + // calldata-first ordering it would throw on the missing blob and its clock would stay frozen. + const observer = await test.createNonValidatorNode({ + skipArchiverInitialSync: true, + skipPromoteProposedCheckpointDuringL1Sync: true, + }); + await retryUntil( + async () => { + const ts = await observer.getSyncedL1Timestamp(); + return ts !== undefined && ts > badL1Timestamp; + }, + 'observer sync clock advances past the canonical invalid-attestations checkpoint without its blob', + test.L2_SLOT_DURATION_IN_S * 8, + 0.5, + ); + + // Resume honest production AND re-enable proposer invalidation: a proposer invalidates the bad + // checkpoint and the chain rebuilds; every node — validators and the observer — progresses past it. + logger.warn(`Resuming honest production to let the chain invalidate and rebuild`); + sequencers.forEach(s => + s.updateConfig({ skipCollectingAttestations: false, skipInvalidateBlockAsProposer: false }), + ); + await driveToCheckpoint(CheckpointNumber(badCheckpointNumber + 1), 20); + const allNodes = [...nodes, observer]; + await retryUntil( + async () => { + const tips = await Promise.all(allNodes.map(n => n.getChainTips().then(t => t.checkpointed.checkpoint.number))); + logger.info(`Node checkpoint tips: ${tips.join(', ')} (target > ${badCheckpointNumber})`); + return tips.every(n => n > badCheckpointNumber); + }, + 'chain invalidates the bad checkpoint and every node (incl. the observer) progresses past it', + test.L2_SLOT_DURATION_IN_S * 12, + 0.5, + ); + + logger.warn(`Test succeeded '${expect.getState().currentTestName}'`); + }); + + // This time, the checkpoint has valid attestations but is unavailable (same failure mode as malformed) + // In this situation the chain gets pruned as no proof can be produced. + it('skips a checkpoint with an unfetchable blob once its epoch can be pruned', async () => { + const sequencers = nodes.map(node => node.getSequencer()!); + + // Produce a couple of healthy checkpoints (valid attestations) by feeding txs, then stop feeding them. + // With empty checkpoints disabled, production halts and the latest checkpoint stays canonical — no + // honest proposer prunes it yet, isolating the observer's recovery to the fix. + await Promise.all(sequencers.map(s => s.start())); + await produceCheckpoint(); + await produceCheckpoint(); + + // Withhold the latest checkpoint's blob from the shared store. + const proposedEvents = await rollupContract.getCheckpointProposedEvents(1n, await l1Client.getBlockNumber()); + const badEvent = proposedEvents.reduce((a, b) => (b.args.checkpointNumber > a.args.checkpointNumber ? b : a)); + const badCheckpointNumber = badEvent.args.checkpointNumber; + logger.warn(`Froze chain at checkpoint ${badCheckpointNumber}`); + const badL1Timestamp = (await l1Client.getBlock({ blockNumber: badEvent!.l1BlockNumber })).timestamp; + // The file blob store namespaces blobs under `/aztec-{chainId}-{version}-0x{rollup}/blobs`. + const sharedRoot = join(test.context.config.dataDirectory!, 'shared-blobs'); + const namespaceDir = (await readdir(sharedRoot)).find(e => e.startsWith('aztec-')); + expect(namespaceDir).toBeDefined(); + const blobsDir = join(sharedRoot, namespaceDir!, 'blobs'); + const targetNames = badEvent!.args.versionedBlobHashes.map(h => `0x${h.toString('hex')}.data`); + const before = await readdir(blobsDir); + for (const name of targetNames) { + expect(before).toContain(name); // guards against the blob path layout drifting and silently passing + await rm(join(blobsDir, name), { force: true }); + } + expect((await readdir(blobsDir)).length).toEqual(before.length - targetNames.length); + logger.warn(`Withheld ${targetNames.length} blob(s) for checkpoint ${badCheckpointNumber} from ${blobsDir}`); + + // Spin up a fresh observer that never promotes (so it must fetch the blob) and does not block on its + // initial sync (so it can stall in the background while we drive the clock forward). + const observer = await test.createNonValidatorNode({ + skipArchiverInitialSync: true, + skipPromoteProposedCheckpointDuringL1Sync: true, + }); + + // It cannot get past the bad checkpoint while its epoch is still provable: the blob fetch throws every + // iteration and the sync clock never advances (getL1Timestamp stays undefined). + logger.warn(`Waiting for obersver node to attempt sync...`); + await sleep(test.L2_SLOT_DURATION_IN_S * 1000); + const frozenTs = await observer.getSyncedL1Timestamp(); + logger.warn(`Observer sync clock before window expiry: ${frozenTs} (bad checkpoint L1 ts ${badL1Timestamp})`); + expect(frozenTs === undefined || frozenTs < badL1Timestamp).toBeTrue(); + + // Advance L1 past the bad checkpoint's epoch proof-submission window, making it prunable. + const now = BigInt(await test.context.cheatCodes.eth.lastBlockTimestamp()); + const windowSeconds = (test.constants.proofSubmissionEpochs + 2) * test.epochDuration * test.L2_SLOT_DURATION_IN_S; + await test.context.cheatCodes.eth.warp(Number(now + BigInt(windowSeconds)), { resetBlockInterval: true }); + logger.warn(`Warped L1 forward by ${windowSeconds}s so checkpoint ${badCheckpointNumber} can be pruned`); + + // With the fix the observer skips the unfetchable checkpoint and its sync clock advances past it. + // Without the fix it keeps throwing on the withheld blob and getL1Timestamp stays frozen/undefined. + await retryUntil( + async () => { + const ts = await observer.getSyncedL1Timestamp(); + return ts !== undefined && ts > badL1Timestamp; + }, + 'observer sync clock unfreezes once the bad checkpoint becomes prunable', + test.L2_SLOT_DURATION_IN_S * 12, + 0.5, + ); + + // Resume production: the next proposer prunes the doomed unproven epoch on L1 (prune-on-propose, since + // its proof window has expired) and the chain rebuilds. Every node — validators and the observer that + // skipped the unfetchable checkpoint — must progress past it. This also implicitly asserts the prune + // happened: had the chain instead built on top of the bad checkpoint, the observer (which never + // ingested it) could not ingest any descendant, so it could never get past badCheckpointNumber. + logger.warn(`Resuming production to let the chain prune and rebuild`); + await driveToCheckpoint(CheckpointNumber(badCheckpointNumber + 1), 20); + + const allNodes = [...nodes, observer]; + await retryUntil( + async () => { + const tips = await Promise.all(allNodes.map(n => n.getChainTips().then(t => t.checkpointed.checkpoint.number))); + logger.info(`Node checkpoint tips: ${tips.join(', ')} (target > ${badCheckpointNumber})`); + return tips.every(n => n > badCheckpointNumber); + }, + 'chain prunes and every node (incl. the previously-stuck observer) progresses past the bad checkpoint', + test.L2_SLOT_DURATION_IN_S * 12, + 0.5, + ); + + logger.warn(`Test succeeded '${expect.getState().currentTestName}'`); + }); +});