Skip to content

Commit 763938c

Browse files
committed
fix: prevent EC2 termination when GitHub runner de-registration fails
When the scale-down Lambda fails to de-register a runner from GitHub (even after automatic retries via @octokit/plugin-retry), the EC2 instance should NOT be terminated. This prevents stale runner entries in GitHub org settings. This change complements PR #4990 which added @octokit/plugin-retry for automatic retries. While that handles transient failures, this ensures that if de-registration ultimately fails, we don't leave orphaned GitHub runner entries by terminating the EC2 instance prematurely. Key changes: - Extract deleteGitHubRunner() helper that catches errors per-runner - Only terminate EC2 instance if ALL GitHub de-registrations succeed - If any de-registration fails, leave instance running for next cycle The @octokit/plugin-retry (added in #4990) handles automatic retries at the client level, so no custom retry logic is needed here. Tests: - Add test verifying EC2 is NOT terminated when de-registration fails
1 parent e4b12ae commit 763938c

File tree

2 files changed

+69
-17
lines changed

2 files changed

+69
-17
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,35 @@ describe('Scale down runners', () => {
631631
await expect(scaleDown()).resolves.not.toThrow();
632632
});
633633

634+
it(`Should not terminate instance when de-registration throws an error.`, async () => {
635+
// setup - runner should NOT be terminated because de-registration fails
636+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
637+
638+
const error502 = new RequestError('Server Error', 502, {
639+
request: {
640+
method: 'DELETE',
641+
url: 'https://api.github.com/test',
642+
headers: {},
643+
},
644+
});
645+
646+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
647+
throw error502;
648+
});
649+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
650+
throw error502;
651+
});
652+
653+
mockGitHubRunners(runners);
654+
mockAwsRunners(runners);
655+
656+
// act
657+
await expect(scaleDown()).resolves.not.toThrow();
658+
659+
// assert - should NOT terminate since de-registration failed
660+
expect(terminateRunner).not.toHaveBeenCalled();
661+
});
662+
634663
const evictionStrategies = ['oldest_first', 'newest_first'];
635664
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
636665
const defaultConfig = {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,33 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127127
return launchTimePlusMinimum < now;
128128
}
129129

130+
async function deleteGitHubRunner(
131+
githubAppClient: Octokit,
132+
ec2runner: RunnerInfo,
133+
ghRunnerId: number,
134+
): Promise<{ ghRunnerId: number; status: number; success: boolean }> {
135+
try {
136+
const response =
137+
ec2runner.type === 'Org'
138+
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
139+
runner_id: ghRunnerId,
140+
org: ec2runner.owner,
141+
})
142+
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
143+
runner_id: ghRunnerId,
144+
owner: ec2runner.owner.split('/')[0],
145+
repo: ec2runner.owner.split('/')[1],
146+
});
147+
return { ghRunnerId, status: response.status, success: response.status === 204 };
148+
} catch (error) {
149+
logger.error(
150+
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. Error: ${error}`,
151+
{ error: error as Error },
152+
);
153+
return { ghRunnerId, status: 0, success: false };
154+
}
155+
}
156+
130157
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131158
const githubAppClient = await getOrCreateOctokit(ec2runner);
132159
try {
@@ -146,28 +173,24 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
146173
);
147174

148175
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
150-
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
163-
}),
176+
const results = await Promise.all(
177+
ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubAppClient, ec2runner, ghRunnerId)),
164178
);
165179

166-
if (statuses.every((status) => status == 204)) {
180+
const allSucceeded = results.every((r) => r.success);
181+
const failedRunners = results.filter((r) => !r.success);
182+
183+
if (allSucceeded) {
167184
await terminateRunner(ec2runner.instanceId);
168185
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169186
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
187+
// Only terminate EC2 if we successfully de-registered from GitHub
188+
// Otherwise, leave the instance running so the next scale-down cycle can retry
189+
logger.error(
190+
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
191+
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
192+
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
193+
);
171194
}
172195
} else {
173196
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);

0 commit comments

Comments
 (0)