Skip to content

Commit b9e8871

Browse files
committed
fix(lambda): add post-deregister busy check to prevent terminating active runners
The scale-down lambda had a TOCTOU race condition where a job could be assigned to a runner between checking its busy state and terminating the EC2 instance. This caused in-flight jobs to be killed mid-execution. The fix adds a post-deregistration busy re-check: 1. Check busy (fast-path to skip busy runners) 2. Deregister from GitHub (prevents new job assignment) 3. Re-check busy (now stable since no new jobs can be assigned) If the runner became busy between step 1 and 2, the in-flight job completes using its job-scoped OAuth token and the instance is left for orphan cleanup. Fixes #5085
1 parent efbaa6f commit b9e8871

File tree

2 files changed

+125
-25
lines changed

2 files changed

+125
-25
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,82 @@ describe('Scale down runners', () => {
336336
checkNonTerminated(runners);
337337
});
338338

339+
it(`Should not terminate a runner that became busy between deregister and post-deregister check.`, async () => {
340+
// setup: runner appears idle on first check, deregister succeeds,
341+
// but post-deregister re-check finds it busy (race condition)
342+
const runners = [
343+
createRunnerTestData(
344+
'race-condition-1',
345+
type,
346+
MINIMUM_TIME_RUNNING_IN_MINUTES + 1,
347+
true,
348+
false,
349+
false,
350+
),
351+
];
352+
353+
mockGitHubRunners(runners);
354+
mockAwsRunners(runners);
355+
356+
// First call returns not-busy (pre-deregister), second returns busy (post-deregister)
357+
let callCount = 0;
358+
const busyCheckMock = () => {
359+
callCount++;
360+
if (callCount <= 1) {
361+
return { data: { busy: false } };
362+
}
363+
return { data: { busy: true } };
364+
};
365+
mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock);
366+
mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock);
367+
368+
// act
369+
await expect(scaleDown()).resolves.not.toThrow();
370+
371+
// assert: runner should NOT be terminated
372+
checkTerminated(runners);
373+
checkNonTerminated(runners);
374+
});
375+
376+
it(`Should terminate a runner when post-deregister busy check returns 404.`, async () => {
377+
// setup: after deregistration, GitHub API returns 404 (runner fully removed)
378+
const runners = [
379+
createRunnerTestData(
380+
'deregistered-404',
381+
type,
382+
MINIMUM_TIME_RUNNING_IN_MINUTES + 1,
383+
true,
384+
false,
385+
true,
386+
),
387+
];
388+
389+
mockGitHubRunners(runners);
390+
mockAwsRunners(runners);
391+
392+
// First call returns not-busy, second throws 404
393+
let callCount = 0;
394+
const busyCheckMock = () => {
395+
callCount++;
396+
if (callCount <= 1) {
397+
return { data: { busy: false } };
398+
}
399+
const error = new Error('Not Found');
400+
(error as any).status = 404;
401+
Object.setPrototypeOf(error, RequestError.prototype);
402+
throw error;
403+
};
404+
mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock);
405+
mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock);
406+
407+
// act
408+
await expect(scaleDown()).resolves.not.toThrow();
409+
410+
// assert: runner should be terminated (404 = not busy)
411+
checkTerminated(runners);
412+
checkNonTerminated(runners);
413+
});
414+
339415
it(`Should terminate orphan (Non JIT)`, async () => {
340416
// setup
341417
const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false);

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -138,39 +138,63 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
138138
return;
139139
}
140140

141+
// Step 1: Check busy state as a fast-path to skip runners that are obviously busy.
141142
const states = await Promise.all(
142143
ghRunnerIds.map(async (ghRunnerId) => {
143-
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
144144
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
145145
}),
146146
);
147147

148-
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
150-
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
163-
}),
164-
);
148+
if (!states.every((busy) => busy === false)) {
149+
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
150+
return;
151+
}
165152

166-
if (statuses.every((status) => status == 204)) {
167-
await terminateRunner(ec2runner.instanceId);
168-
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169-
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
171-
}
153+
// Step 2: De-register the runner from GitHub. This prevents GitHub from assigning new jobs
154+
// to this runner, closing the race window where a job could be assigned between the busy
155+
// check above and the termination below.
156+
const statuses = await Promise.all(
157+
ghRunnerIds.map(async (ghRunnerId) => {
158+
return (
159+
ec2runner.type === 'Org'
160+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
161+
runner_id: ghRunnerId,
162+
org: ec2runner.owner,
163+
})
164+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
165+
runner_id: ghRunnerId,
166+
owner: ec2runner.owner.split('/')[0],
167+
repo: ec2runner.owner.split('/')[1],
168+
})
169+
).status;
170+
}),
171+
);
172+
173+
if (!statuses.every((status) => status == 204)) {
174+
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
175+
return;
176+
}
177+
178+
// Step 3: Re-check busy state after de-registration. A job may have been assigned between
179+
// step 1 and step 2. After de-registration no new jobs can be assigned, so this check is
180+
// now stable. If the runner is busy, the in-flight job will complete using its job-scoped
181+
// OAuth token (the runner worker uses credentials from the job message, not the runner
182+
// registration). We leave the instance running and it will be cleaned up as an orphan.
183+
const postDeregisterStates = await Promise.all(
184+
ghRunnerIds.map(async (ghRunnerId) => {
185+
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
186+
}),
187+
);
188+
189+
if (postDeregisterStates.every((busy) => busy === false)) {
190+
await terminateRunner(ec2runner.instanceId);
191+
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
172192
} else {
173-
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
193+
logger.warn(
194+
`Runner '${ec2runner.instanceId}' became busy between idle check and de-registration. ` +
195+
`Skipping termination to allow the in-flight job to complete. ` +
196+
`The instance will be cleaned up as an orphan on a subsequent cycle.`,
197+
);
174198
}
175199
} catch (e) {
176200
logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {

0 commit comments

Comments
 (0)