Skip to content

Commit aa2e75b

Browse files
committed
fix: prevent EC2 termination when GitHub runner de-registration fails
When the scale-down Lambda fails to de-register a runner from GitHub (even after automatic retries via @octokit/plugin-retry), the EC2 instance should NOT be terminated. This prevents stale runner entries in GitHub org settings. This change complements PR #4990 which added @octokit/plugin-retry for automatic retries. While that handles transient failures, this ensures that if de-registration ultimately fails, we don't leave orphaned GitHub runner entries by terminating the EC2 instance prematurely. Key changes: - Extract deleteGitHubRunner() helper that catches errors per-runner - Only terminate EC2 instance if ALL GitHub de-registrations succeed - If any de-registration fails, leave instance running for next cycle The @octokit/plugin-retry (added in #4990) handles automatic retries at the client level, so no custom retry logic is needed here. Tests: - Add test verifying EC2 is NOT terminated when de-registration fails
1 parent e4b12ae commit aa2e75b

File tree

2 files changed

+75
-19
lines changed

2 files changed

+75
-19
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,35 @@ describe('Scale down runners', () => {
631631
await expect(scaleDown()).resolves.not.toThrow();
632632
});
633633

634+
it(`Should not terminate instance when de-registration throws an error.`, async () => {
635+
// setup - runner should NOT be terminated because de-registration fails
636+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
637+
638+
const error502 = new RequestError('Server Error', 502, {
639+
request: {
640+
method: 'DELETE',
641+
url: 'https://api.github.com/test',
642+
headers: {},
643+
},
644+
});
645+
646+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
647+
throw error502;
648+
});
649+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
650+
throw error502;
651+
});
652+
653+
mockGitHubRunners(runners);
654+
mockAwsRunners(runners);
655+
656+
// act
657+
await expect(scaleDown()).resolves.not.toThrow();
658+
659+
// assert - should NOT terminate since de-registration failed
660+
expect(terminateRunner).not.toHaveBeenCalled();
661+
});
662+
634663
const evictionStrategies = ['oldest_first', 'newest_first'];
635664
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
636665
const defaultConfig = {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,39 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127127
return launchTimePlusMinimum < now;
128128
}
129129

130+
async function deleteGitHubRunner(
131+
githubInstallationClient: Octokit,
132+
ec2runner: RunnerInfo,
133+
ghRunnerId: number,
134+
): Promise<{ ghRunnerId: number; status: number; success: boolean }> {
135+
try {
136+
let response;
137+
if (ec2runner.type === 'Org') {
138+
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({
139+
runner_id: ghRunnerId,
140+
org: ec2runner.owner,
141+
});
142+
} else {
143+
const [owner, repo] = ec2runner.owner.split('/');
144+
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({
145+
runner_id: ghRunnerId,
146+
owner,
147+
repo,
148+
});
149+
}
150+
return { ghRunnerId, status: response.status, success: response.status === 204 };
151+
} catch (error) {
152+
logger.error(
153+
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` +
154+
`Error: ${error instanceof Error ? error.message : String(error)}`,
155+
{ error },
156+
);
157+
return { ghRunnerId, status: 0, success: false };
158+
}
159+
}
160+
130161
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131-
const githubAppClient = await getOrCreateOctokit(ec2runner);
162+
const githubInstallationClient = await getOrCreateOctokit(ec2runner);
132163
try {
133164
const runnerList = ec2runner as unknown as RunnerList;
134165
if (runnerList.bypassRemoval) {
@@ -141,33 +172,29 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
141172
const states = await Promise.all(
142173
ghRunnerIds.map(async (ghRunnerId) => {
143174
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
144-
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
175+
return await getGitHubRunnerBusyState(githubInstallationClient, ec2runner, ghRunnerId);
145176
}),
146177
);
147178

148179
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
150-
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
163-
}),
180+
const results = await Promise.all(
181+
ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubInstallationClient, ec2runner, ghRunnerId)),
164182
);
165183

166-
if (statuses.every((status) => status == 204)) {
184+
const allSucceeded = results.every((r) => r.success);
185+
const failedRunners = results.filter((r) => !r.success);
186+
187+
if (allSucceeded) {
167188
await terminateRunner(ec2runner.instanceId);
168189
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169190
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
191+
// Only terminate EC2 if we successfully de-registered from GitHub
192+
// Otherwise, leave the instance running so the next scale-down cycle can retry
193+
logger.error(
194+
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
195+
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
196+
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
197+
);
171198
}
172199
} else {
173200
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);

0 commit comments

Comments
 (0)