Skip to content

Commit ed30bf8

Browse files
committed
Add SQS-based deregistration retry for busy runners (C-243)
When GitHub returns 422 on runner deletion (runner executing a job), instead of silently dropping the attempt, enqueue a retry message to SQS with a 5-minute delay. By that time the EC2 instance has been terminated and the runner appears offline, allowing clean deletion. Changes: - deregister.ts: send 422 failures to DEREGISTER_RETRY_QUEUE_URL SQS queue; add handleDeregisterRetry for processing retry messages - lambda.ts: export deregisterRetry SQS handler - package.json: add @aws-sdk/client-sqs dependency - scale-down.ts: remove reconcileGitHubRunners polling (replaced by SQS) - modules/multi-runner: add environment_variables to instance_termination_watcher variable and pass through to Lambda config - modules/termination-watcher: merge caller-supplied environment_variables into notification and handler Lambda env var configs
1 parent 3542b85 commit ed30bf8

File tree

9 files changed

+10603
-7000
lines changed

9 files changed

+10603
-7000
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 0 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -313,91 +313,6 @@ function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
313313
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
314314
}
315315

316-
async function reconcileGitHubRunners(environment: string, ec2Runners: RunnerList[]): Promise<void> {
317-
const offlineThresholdMinutes = parseInt(process.env.OFFLINE_RUNNER_DEREGISTER_MINUTES ?? '10');
318-
if (offlineThresholdMinutes <= 0) {
319-
logger.debug('Offline runner reconciliation is disabled (threshold <= 0)');
320-
return;
321-
}
322-
323-
const ec2InstanceIds = new Set(ec2Runners.map((r) => r.instanceId));
324-
325-
// Build a set of unique owners/types from the EC2 runners we know about.
326-
// If there are no EC2 runners, we still need at least one owner to query GitHub.
327-
// Fall back to environment tags to find the org.
328-
const ownerTypes = new Map<string, string>();
329-
for (const r of ec2Runners) {
330-
if (r.owner && r.type) {
331-
ownerTypes.set(r.owner, r.type);
332-
}
333-
}
334-
335-
// If no EC2 runners exist, we can't determine the owner to query GitHub.
336-
// This is fine — the scale-up Lambda will handle it once new runners register.
337-
if (ownerTypes.size === 0) {
338-
logger.debug('No EC2 runners with owner tags found, skipping GitHub runner reconciliation');
339-
return;
340-
}
341-
342-
for (const [owner, runnerType] of ownerTypes) {
343-
try {
344-
// Create a synthetic RunnerInfo to reuse the existing GitHub client helpers
345-
const syntheticRunner: RunnerInfo = { instanceId: 'reconciler', owner, type: runnerType };
346-
const ghRunners = await listGitHubRunners(syntheticRunner);
347-
348-
// Find GitHub runners whose name contains an environment prefix that matches ours,
349-
// that are offline, and have no corresponding EC2 instance
350-
const orphanedGhRunners = ghRunners.filter((ghRunner: { name: string; status: string; id: number }) => {
351-
if (ghRunner.status !== 'offline') return false;
352-
// Check if this runner's EC2 instance still exists
353-
const matchesEc2 = Array.from(ec2InstanceIds).some((instanceId) => ghRunner.name.includes(instanceId));
354-
return !matchesEc2;
355-
});
356-
357-
if (orphanedGhRunners.length === 0) {
358-
logger.debug(`No orphaned GitHub runners found for owner '${owner}'`);
359-
continue;
360-
}
361-
362-
logger.info(
363-
`Found ${orphanedGhRunners.length} offline GitHub runner(s) with no EC2 instance for owner '${owner}'`,
364-
);
365-
366-
const client = await getOrCreateOctokit(syntheticRunner);
367-
for (const ghRunner of orphanedGhRunners) {
368-
try {
369-
if (runnerType === 'Org') {
370-
await client.actions.deleteSelfHostedRunnerFromOrg({
371-
org: owner,
372-
runner_id: (ghRunner as { id: number }).id,
373-
});
374-
} else {
375-
const [repoOwner, repo] = owner.split('/');
376-
await client.actions.deleteSelfHostedRunnerFromRepo({
377-
owner: repoOwner,
378-
repo,
379-
runner_id: (ghRunner as { id: number }).id,
380-
});
381-
}
382-
logger.info(`Deregistered orphaned GitHub runner '${(ghRunner as { name: string }).name}' (ID: ${(ghRunner as { id: number }).id})`);
383-
} catch (error) {
384-
if (error instanceof RequestError && error.status === 422) {
385-
logger.warn(
386-
`Cannot deregister runner '${(ghRunner as { name: string }).name}' — still marked as busy. Will retry next cycle.`,
387-
);
388-
} else {
389-
logger.error(`Failed to deregister orphaned runner '${(ghRunner as { name: string }).name}'`, {
390-
error: error as Error,
391-
});
392-
}
393-
}
394-
}
395-
} catch (error) {
396-
logger.warn(`Failed to reconcile GitHub runners for owner '${owner}'`, { error: error as Error });
397-
}
398-
}
399-
}
400-
401316
export async function scaleDown(): Promise<void> {
402317
githubCache.reset();
403318
const environment = process.env.ENVIRONMENT;
@@ -412,11 +327,6 @@ export async function scaleDown(): Promise<void> {
412327
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);
413328
logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`);
414329

415-
// Reconcile: deregister GitHub runners whose EC2 instances no longer exist.
416-
// This prevents deadlocks where offline ghost runners count toward the max,
417-
// blocking scale-up from launching replacements.
418-
await reconcileGitHubRunners(environment, ec2Runners);
419-
420330
if (activeEc2RunnersCount === 0) {
421331
logger.debug(`No active runners found for environment: '${environment}'`);
422332
return;

lambdas/functions/termination-watcher/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"@aws-github-runner/aws-powertools-util": "*",
2727
"@aws-github-runner/aws-ssm-util": "*",
2828
"@aws-sdk/client-ec2": "^3.984.0",
29+
"@aws-sdk/client-sqs": "^3.984.0",
2930
"@middy/core": "^6.4.5",
3031
"@octokit/auth-app": "8.2.0",
3132
"@octokit/core": "7.0.6",

lambdas/functions/termination-watcher/src/deregister.ts

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,22 @@ import { Octokit } from '@octokit/rest';
33
import { throttling } from '@octokit/plugin-throttling';
44
import { request } from '@octokit/request';
55
import { Instance } from '@aws-sdk/client-ec2';
6+
import { SQSClient, SendMessageCommand } from '@aws-sdk/client-sqs';
67
import { createChildLogger } from '@aws-github-runner/aws-powertools-util';
78
import { getParameter } from '@aws-github-runner/aws-ssm-util';
89
import type { EndpointDefaults } from '@octokit/types';
910
import type { Config } from './ConfigResolver';
1011

12+
// Payload placed on the deregistration retry queue when GitHub rejects a
// runner deletion with HTTP 422 (runner still busy executing a job).
export interface DeregisterRetryMessage {
  // EC2 instance id; the retry handler resolves the GitHub runner by matching
  // this id against runner names (see findRunnerByInstanceId usage).
  instanceId: string;
  // GitHub owner the runner is registered under — presumably an org login or
  // an 'owner/repo' pair depending on runnerType; verify against ConfigResolver.
  owner: string;
  // Runner scope discriminator (e.g. 'Org' vs repo) — NOTE(review): exact
  // accepted values are defined outside this file, confirm before reuse.
  runnerType: string;
  // GitHub runner id known at enqueue time. deregisterRunner enqueues 0 and
  // handleDeregisterRetry re-resolves the runner by instanceId instead.
  runnerId: number;
  // Retry attempts already performed; incremented on every re-enqueue.
  retryCount: number;
}

// Module-level SQS client, reused across warm Lambda invocations.
const sqsClient = new SQSClient({ region: process.env.AWS_REGION });
21+
1122
const logger = createChildLogger('deregister');
1223

1324
export function createThrottleOptions() {
@@ -198,20 +209,81 @@ export async function deregisterRunner(instance: Instance, config: Config): Prom
198209
});
199210
} catch (error) {
200211
// GitHub returns 422 when a runner is currently executing a job.
201-
// The runner will become offline after the instance terminates, and the
202-
// scale-down Lambda's reconciliation loop will clean it up on its next cycle.
212+
// Queue a delayed retry — the instance will be terminated by EC2 shortly,
213+
// and the runner will appear offline when we retry in 5 minutes.
203214
const isRunnerBusy = error instanceof Error && 'status' in error && (error as { status: number }).status === 422;
204215
if (isRunnerBusy) {
205-
logger.warn('Runner is currently busy, cannot deregister now. Scale-down reconciliation will clean it up.', {
216+
const queueUrl = process.env.DEREGISTER_RETRY_QUEUE_URL;
217+
if (queueUrl) {
218+
await queueDeregisterRetry(queueUrl, { instanceId, owner, runnerType, runnerId: 0, retryCount: 0 });
219+
logger.warn('Runner is busy — queued deregistration retry in 5 minutes via SQS', { instanceId, owner });
220+
} else {
221+
logger.warn('Runner is busy and DEREGISTER_RETRY_QUEUE_URL is not set — deregistration skipped', {
222+
instanceId,
223+
owner,
224+
});
225+
}
226+
} else {
227+
logger.error('Failed to deregister runner from GitHub', {
206228
instanceId,
207229
owner,
230+
error: error as Error,
231+
});
232+
}
233+
}
234+
}
235+
236+
async function queueDeregisterRetry(queueUrl: string, message: DeregisterRetryMessage): Promise<void> {
237+
const command = new SendMessageCommand({
238+
QueueUrl: queueUrl,
239+
MessageBody: JSON.stringify(message),
240+
});
241+
await sqsClient.send(command);
242+
}
243+
244+
export async function handleDeregisterRetry(queueUrl: string, message: DeregisterRetryMessage): Promise<void> {
245+
const { instanceId, owner, runnerType, retryCount } = message;
246+
logger.info('Processing deregistration retry from SQS', { instanceId, owner, runnerType, retryCount });
247+
248+
try {
249+
const appOctokit = await createAuthenticatedClient('');
250+
const installationOctokit = await createInstallationClient(appOctokit, owner, runnerType, '');
251+
252+
const runner = await findRunnerByInstanceId(installationOctokit, owner, instanceId, runnerType);
253+
if (!runner) {
254+
logger.info('Runner not found in GitHub — already deregistered or never registered', { instanceId, owner });
255+
return;
256+
}
257+
258+
await deleteRunner(installationOctokit, owner, runner.id, runnerType);
259+
logger.info('Successfully deregistered runner via SQS retry', {
260+
instanceId,
261+
runnerId: runner.id,
262+
runnerName: runner.name,
263+
owner,
264+
retryCount,
265+
});
266+
} catch (error) {
267+
const isRunnerBusy = error instanceof Error && 'status' in error && (error as { status: number }).status === 422;
268+
if (isRunnerBusy) {
269+
// Re-enqueue for another retry — SQS maxReceiveCount DLQ will stop after 3 total attempts.
270+
// Re-send explicitly so each retry resets the delay (SQS visibility timeout applies on re-receive,
271+
// but re-sending gives us the full 5-minute DelaySeconds again).
272+
await queueDeregisterRetry(queueUrl, { ...message, retryCount: retryCount + 1 });
273+
logger.warn('Runner still busy on retry — re-queued for another attempt', {
274+
instanceId,
275+
owner,
276+
retryCount: retryCount + 1,
208277
});
209278
} else {
210-
logger.error('Failed to deregister runner from GitHub', {
279+
logger.error('Failed to deregister runner on retry', {
211280
instanceId,
212281
owner,
282+
retryCount,
213283
error: error as Error,
214284
});
285+
// Re-throw so SQS treats this as a failure and routes to DLQ after maxReceiveCount
286+
throw error;
215287
}
216288
}
217289
}

lambdas/functions/termination-watcher/src/lambda.ts

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import middy from '@middy/core';
22
import { captureLambdaHandler, logger, metrics, setContext, tracer } from '@aws-github-runner/aws-powertools-util';
33
import { logMetrics } from '@aws-lambda-powertools/metrics/middleware';
4-
import { Context } from 'aws-lambda';
4+
import { Context, SQSEvent } from 'aws-lambda';
55

66
import { handle as handleTerminationWarning } from './termination-warning';
77
import { handle as handleTermination } from './termination';
8+
import { handleDeregisterRetry, DeregisterRetryMessage } from './deregister';
89
import { BidEvictedDetail, BidEvictedEvent, SpotInterruptionWarning, SpotTerminationDetail } from './types';
910
import { Config } from './ConfigResolver';
1011

@@ -37,6 +38,29 @@ export async function termination(event: BidEvictedEvent<BidEvictedDetail>, cont
3738
}
3839
}
3940

41+
export async function deregisterRetry(event: SQSEvent, context: Context): Promise<void> {
42+
setContext(context, 'lambda.ts');
43+
logger.logEventIfEnabled(event);
44+
logger.debug('Processing SQS deregister retry batch', { recordCount: event.Records.length });
45+
46+
const queueUrl = process.env.DEREGISTER_RETRY_QUEUE_URL;
47+
if (!queueUrl) {
48+
logger.error('DEREGISTER_RETRY_QUEUE_URL is not set — cannot process retry messages');
49+
return;
50+
}
51+
52+
for (const record of event.Records) {
53+
try {
54+
const message = JSON.parse(record.body) as DeregisterRetryMessage;
55+
await handleDeregisterRetry(queueUrl, message);
56+
} catch (e) {
57+
logger.error(`Failed to process SQS record ${record.messageId}`, { error: e as Error });
58+
// Re-throw to mark the message as failed so SQS can retry or route to DLQ
59+
throw e;
60+
}
61+
}
62+
}
63+
4064
const addMiddleware = () => {
4165
const middleware = middy(interruptionWarning);
4266

0 commit comments

Comments
 (0)