Skip to content

Commit 0bea361

Browse files
committed
fix: combine upstream fixes for scale-down and scale-up lambdas
- Prevent EC2 termination when GitHub runner de-registration fails (upstream PR github-aws-runners#5061)
- Prevent negative TotalTargetCapacity when runners exceed maximum (upstream PR github-aws-runners#5062)

Changes:
- scale-down.ts: Extract deleteGitHubRunner() helper, only terminate EC2 if all de-registrations succeed
- scale-up.ts: Add Math.max(0, ...) guard to prevent negative runner count
1 parent 6dc97d5 commit 0bea361

File tree

4 files changed

+114
-34
lines changed

4 files changed

+114
-34
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,35 @@ describe('Scale down runners', () => {
631631
await expect(scaleDown()).resolves.not.toThrow();
632632
});
633633

634+
it(`Should not terminate instance when de-registration throws an error.`, async () => {
635+
// setup - runner should NOT be terminated because de-registration fails
636+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
637+
638+
const error502 = new RequestError('Server Error', 502, {
639+
request: {
640+
method: 'DELETE',
641+
url: 'https://api.github.com/test',
642+
headers: {},
643+
},
644+
});
645+
646+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
647+
throw error502;
648+
});
649+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
650+
throw error502;
651+
});
652+
653+
mockGitHubRunners(runners);
654+
mockAwsRunners(runners);
655+
656+
// act
657+
await expect(scaleDown()).resolves.not.toThrow();
658+
659+
// assert - should NOT terminate since de-registration failed
660+
expect(terminateRunner).not.toHaveBeenCalled();
661+
});
662+
634663
const evictionStrategies = ['oldest_first', 'newest_first'];
635664
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
636665
const defaultConfig = {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 64 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -103,17 +103,20 @@ async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> {
103103

104104
logger.debug(`[listGithubRunners] Cache miss for ${key}`);
105105
const client = await getOrCreateOctokit(runner);
106-
const runners =
107-
runner.type === 'Org'
108-
? await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
109-
org: runner.owner,
110-
per_page: 100,
111-
})
112-
: await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
113-
owner: runner.owner.split('/')[0],
114-
repo: runner.owner.split('/')[1],
115-
per_page: 100,
116-
});
106+
let runners;
107+
if (runner.type === 'Org') {
108+
runners = await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
109+
org: runner.owner,
110+
per_page: 100,
111+
});
112+
} else {
113+
const [owner, repo] = runner.owner.split('/');
114+
runners = await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
115+
owner,
116+
repo,
117+
per_page: 100,
118+
});
119+
}
117120
githubCache.runners.set(key, runners);
118121
logger.debug(`[listGithubRunners] Cache set for ${key}`);
119122
logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(runners)}`);
@@ -127,8 +130,39 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127130
return launchTimePlusMinimum < now;
128131
}
129132

133+
async function deleteGitHubRunner(
134+
githubInstallationClient: Octokit,
135+
ec2runner: RunnerInfo,
136+
ghRunnerId: number,
137+
): Promise<{ ghRunnerId: number; status: number; success: boolean }> {
138+
try {
139+
let response;
140+
if (ec2runner.type === 'Org') {
141+
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({
142+
runner_id: ghRunnerId,
143+
org: ec2runner.owner,
144+
});
145+
} else {
146+
const [owner, repo] = ec2runner.owner.split('/');
147+
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({
148+
runner_id: ghRunnerId,
149+
owner,
150+
repo,
151+
});
152+
}
153+
return { ghRunnerId, status: response.status, success: response.status === 204 };
154+
} catch (error) {
155+
logger.error(
156+
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` +
157+
`Error: ${error instanceof Error ? error.message : String(error)}`,
158+
{ error },
159+
);
160+
return { ghRunnerId, status: 0, success: false };
161+
}
162+
}
163+
130164
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131-
const githubAppClient = await getOrCreateOctokit(ec2runner);
165+
const githubInstallationClient = await getOrCreateOctokit(ec2runner);
132166
try {
133167
const runnerList = ec2runner as unknown as RunnerList;
134168
if (runnerList.bypassRemoval) {
@@ -141,41 +175,38 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
141175
const states = await Promise.all(
142176
ghRunnerIds.map(async (ghRunnerId) => {
143177
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
144-
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
178+
return await getGitHubRunnerBusyState(githubInstallationClient, ec2runner, ghRunnerId);
145179
}),
146180
);
147181

148182
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
150-
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
163-
}),
183+
const results = await Promise.all(
184+
ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubInstallationClient, ec2runner, ghRunnerId)),
164185
);
165186

166-
if (statuses.every((status) => status == 204)) {
187+
const allSucceeded = results.every((r) => r.success);
188+
const failedRunners = results.filter((r) => !r.success);
189+
190+
if (allSucceeded) {
167191
await terminateRunner(ec2runner.instanceId);
168192
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169193
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
194+
// Only terminate EC2 if we successfully de-registered from GitHub
195+
// Otherwise, leave the instance running so the next scale-down cycle can retry
196+
logger.error(
197+
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
198+
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
199+
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
200+
);
171201
}
172202
} else {
173203
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
174204
}
175205
} catch (e) {
176-
logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {
177-
error: e as Error,
178-
});
206+
logger.error(
207+
`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e instanceof Error ? e.message : String(e)}`,
208+
{ error: e },
209+
);
179210
}
180211
}
181212

lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,24 @@ describe('scaleUp with GHES', () => {
228228
expect(mockOctokit.actions.createRegistrationTokenForRepo).not.toBeCalled();
229229
});
230230

231+
it('does not create runners when current runners exceed maximum (race condition)', async () => {
232+
process.env.RUNNERS_MAXIMUM_COUNT = '5';
233+
process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';
234+
// Simulate race condition where pool lambda created more runners than max
235+
mockListRunners.mockImplementation(async () =>
236+
Array.from({ length: 10 }, (_, i) => ({
237+
instanceId: `i-${i}`,
238+
launchTime: new Date(),
239+
type: 'Org',
240+
owner: TEST_DATA_SINGLE.repositoryOwner,
241+
})),
242+
);
243+
await scaleUpModule.scaleUp(TEST_DATA);
244+
// Should not attempt to create runners (would be negative without fix)
245+
expect(createRunner).not.toBeCalled();
246+
expect(mockOctokit.actions.createRegistrationTokenForOrg).not.toBeCalled();
247+
});
248+
231249
it('does create a runner if maximum is set to -1', async () => {
232250
process.env.RUNNERS_MAXIMUM_COUNT = '-1';
233251
process.env.ENABLE_EPHEMERAL_RUNNERS = 'false';

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,12 +418,14 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
418418
});
419419

420420
// Calculate how many runners we want to create.
421+
// Use Math.max(0, ...) to ensure we never attempt to create a negative number of runners,
422+
// which can happen when currentRunners exceeds maximumRunners due to pool/scale-up race conditions.
421423
const newRunners =
422424
maximumRunners === -1
423425
? // If we don't have an upper limit, scale up by the number of new jobs.
424426
scaleUp
425427
: // Otherwise, we do have a limit, so work out if `scaleUp` would exceed it.
426-
Math.min(scaleUp, maximumRunners - currentRunners);
428+
Math.max(0, Math.min(scaleUp, maximumRunners - currentRunners));
427429

428430
const missingInstanceCount = Math.max(0, scaleUp - newRunners);
429431

0 commit comments

Comments (0)