Skip to content

Commit cd612ca

Browse files
committed
Add GitHub runner reconciliation to prevent ghost runner deadlocks
When a runner is terminated while executing a job (e.g., spot reclamation, power disruption), the GitHub API returns 422 and refuses to delete it. The runner stays registered as "offline" indefinitely, counting toward the maximum runner limit and preventing new runners from launching.

Changes:
- scale-down.ts: Add reconcileGitHubRunners(), which runs on every scale-down cycle (every 5 minutes). It lists all GitHub runners, compares them against live EC2 instances, and deregisters any offline runners whose instances no longer exist.
- deregister.ts: Improve 422 error handling — log a warning instead of an error, since the scale-down reconciliation will clean the runner up.

The reconciliation is controlled by the OFFLINE_RUNNER_DEREGISTER_MINUTES env var (defaults to 10). Set it to 0 to disable.
1 parent db6a268 commit cd612ca

File tree

2 files changed

+106
-5
lines changed

2 files changed

+106
-5
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,91 @@ function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] {
313313
return ec2runners.filter((ec2Runner) => ec2Runner.type && !ec2Runner.orphan) as RunnerInfo[];
314314
}
315315

316+
/**
 * Deregisters "ghost" GitHub runners: runners that GitHub reports as offline and whose
 * EC2 instance is no longer among the instances passed in.
 *
 * For every distinct owner found on `ec2Runners`, the owner's GitHub runners are listed and
 * each offline runner is checked against the set of live instance ids. A runner is only
 * considered for removal when its name embeds at least one EC2 instance id and none of the
 * embedded ids is live; runners whose names carry no instance id are left untouched so that
 * runners not launched from EC2 are never deregistered by this loop.
 *
 * Failures are logged and never thrown: a 422 from GitHub (runner still marked busy) is
 * logged as a warning, any other delete failure as an error, and a failure while listing
 * runners for one owner does not prevent the remaining owners from being processed.
 *
 * NOTE(review): OFFLINE_RUNNER_DEREGISTER_MINUTES only acts as an on/off switch here
 * (<= 0 disables); no offline duration is measured.
 *
 * @param environment - Environment name of the caller. NOTE(review): currently unused; runner
 *   names are not filtered by any environment prefix — confirm whether that is intended.
 * @param ec2Runners - EC2 runner instances that are known to exist; used both to discover
 *   owners to query and as the set of live instance ids.
 */
async function reconcileGitHubRunners(environment: string, ec2Runners: RunnerList[]): Promise<void> {
  const defaultThresholdMinutes = 10;
  const parsedThreshold = Number.parseInt(
    process.env.OFFLINE_RUNNER_DEREGISTER_MINUTES ?? String(defaultThresholdMinutes),
    10,
  );
  // A non-numeric value parses to NaN; use the default explicitly instead of relying on
  // NaN comparisons silently evaluating to false.
  const offlineThresholdMinutes = Number.isNaN(parsedThreshold) ? defaultThresholdMinutes : parsedThreshold;
  if (offlineThresholdMinutes <= 0) {
    logger.debug('Offline runner reconciliation is disabled (threshold <= 0)');
    return;
  }

  // EC2 instance ids are `i-` followed by 8 (legacy) or 17 hex characters. The lookahead
  // stops a short id from matching as the prefix of a longer hex run.
  const instanceIdPattern = /i-[0-9a-f]{8,17}(?![0-9a-f])/g;
  const ec2InstanceIds = new Set(ec2Runners.map((r) => r.instanceId));

  // Collect the owners (and their runner type) from the EC2 runners we know about;
  // they are the only source of owners to query GitHub with.
  const ownerTypes = new Map<string, string>();
  for (const r of ec2Runners) {
    if (r.owner && r.type) {
      ownerTypes.set(r.owner, r.type);
    }
  }

  // Without any tagged EC2 runner there is no owner to query, so there is nothing to do.
  if (ownerTypes.size === 0) {
    logger.debug('No EC2 runners with owner tags found, skipping GitHub runner reconciliation');
    return;
  }

  for (const [owner, runnerType] of ownerTypes) {
    try {
      // Create a synthetic RunnerInfo to reuse the existing GitHub client helpers.
      const syntheticRunner: RunnerInfo = { instanceId: 'reconciler', owner, type: runnerType };
      const ghRunners = await listGitHubRunners(syntheticRunner);

      const orphanedGhRunners = ghRunners.filter((ghRunner: { name: string; status: string; id: number }) => {
        if (ghRunner.status !== 'offline') return false;
        const embeddedIds: string[] = ghRunner.name.match(instanceIdPattern) ?? [];
        // No instance id in the name: not a runner we can tie to an EC2 instance, leave it alone.
        if (embeddedIds.length === 0) return false;
        // Orphaned only when none of the embedded ids belongs to a live instance.
        return !embeddedIds.some((instanceId) => ec2InstanceIds.has(instanceId));
      });

      if (orphanedGhRunners.length === 0) {
        logger.debug(`No orphaned GitHub runners found for owner '${owner}'`);
        continue;
      }

      logger.info(
        `Found ${orphanedGhRunners.length} offline GitHub runner(s) with no EC2 instance for owner '${owner}'`,
      );

      const client = await getOrCreateOctokit(syntheticRunner);
      // Deletions run one at a time so a failure on one runner is isolated from the others.
      for (const ghRunner of orphanedGhRunners) {
        const { id, name } = ghRunner as { id: number; name: string };
        try {
          if (runnerType === 'Org') {
            await client.actions.deleteSelfHostedRunnerFromOrg({
              org: owner,
              runner_id: id,
            });
          } else {
            // For non-org runners the owner is expected in `owner/repo` form.
            const [repoOwner, repo] = owner.split('/');
            await client.actions.deleteSelfHostedRunnerFromRepo({
              owner: repoOwner,
              repo,
              runner_id: id,
            });
          }
          logger.info(`Deregistered orphaned GitHub runner '${name}' (ID: ${id})`);
        } catch (error) {
          if (error instanceof RequestError && error.status === 422) {
            // GitHub refuses to delete a runner it still considers busy; try again next cycle.
            logger.warn(`Cannot deregister runner '${name}' — still marked as busy. Will retry next cycle.`);
          } else {
            logger.error(`Failed to deregister orphaned runner '${name}'`, {
              error: error as Error,
            });
          }
        }
      }
    } catch (error) {
      logger.warn(`Failed to reconcile GitHub runners for owner '${owner}'`, { error: error as Error });
    }
  }
}
400+
316401
export async function scaleDown(): Promise<void> {
317402
githubCache.reset();
318403
const environment = process.env.ENVIRONMENT;
@@ -327,6 +412,11 @@ export async function scaleDown(): Promise<void> {
327412
logger.info(`Found: '${activeEc2RunnersCount}' active GitHub EC2 runner instances before clean-up.`);
328413
logger.debug(`Active GitHub EC2 runner instances: ${JSON.stringify(ec2Runners)}`);
329414

415+
// Reconcile: deregister GitHub runners whose EC2 instances no longer exist.
416+
// This prevents deadlocks where offline ghost runners count toward the max,
417+
// blocking scale-up from launching replacements.
418+
await reconcileGitHubRunners(environment, ec2Runners);
419+
330420
if (activeEc2RunnersCount === 0) {
331421
logger.debug(`No active runners found for environment: '${environment}'`);
332422
return;

lambdas/functions/termination-watcher/src/deregister.ts

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,21 @@ export async function deregisterRunner(instance: Instance, config: Config): Prom
195195
owner,
196196
});
197197
} catch (error) {
198-
logger.error('Failed to deregister runner from GitHub', {
199-
instanceId,
200-
owner,
201-
error: error as Error,
202-
});
198+
// GitHub returns 422 when a runner is currently executing a job.
199+
// The runner will become offline after the instance terminates, and the
200+
// scale-down Lambda's reconciliation loop will clean it up on its next cycle.
201+
const isRunnerBusy = error instanceof Error && 'status' in error && (error as { status: number }).status === 422;
202+
if (isRunnerBusy) {
203+
logger.warn('Runner is currently busy, cannot deregister now. Scale-down reconciliation will clean it up.', {
204+
instanceId,
205+
owner,
206+
});
207+
} else {
208+
logger.error('Failed to deregister runner from GitHub', {
209+
instanceId,
210+
owner,
211+
error: error as Error,
212+
});
213+
}
203214
}
204215
}

0 commit comments

Comments
 (0)