Skip to content

Commit 8599aac

Browse files
committed
fix: prevent EC2 termination when GitHub runner de-registration fails
When the scale-down Lambda fails to de-register a runner from GitHub (even after automatic retries via @octokit/plugin-retry), the EC2 instance should NOT be terminated. This prevents stale runner entries in GitHub org settings. This change complements PR #4990 which added @octokit/plugin-retry for automatic retries. While that handles transient failures, this ensures that if de-registration ultimately fails, we don't leave orphaned GitHub runner entries by terminating the EC2 instance prematurely. Key changes: - Extract deleteGitHubRunner() helper that catches errors per-runner - Only terminate EC2 instance if ALL GitHub de-registrations succeed - If any de-registration fails, leave instance running for next cycle - Rename githubAppClient to githubInstallationClient for clarity - Refactor to split owner/repo once instead of multiple times - Fix error logging to handle non-Error objects properly The @octokit/plugin-retry (added in #4990) handles automatic retries at the client level, so no custom retry logic is needed here. Tests: - Add test verifying EC2 is NOT terminated when de-registration fails
1 parent 6dc97d5 commit 8599aac

File tree

2 files changed

+93
-33
lines changed

2 files changed

+93
-33
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,35 @@ describe('Scale down runners', () => {
631631
await expect(scaleDown()).resolves.not.toThrow();
632632
});
633633

634+
it(`Should not terminate instance when de-registration throws an error.`, async () => {
635+
// setup - runner should NOT be terminated because de-registration fails
636+
const runners = [createRunnerTestData('idle-1', type, MINIMUM_TIME_RUNNING_IN_MINUTES + 1, true, false, false)];
637+
638+
const error502 = new RequestError('Server Error', 502, {
639+
request: {
640+
method: 'DELETE',
641+
url: 'https://api.github.com/test',
642+
headers: {},
643+
},
644+
});
645+
646+
mockOctokit.actions.deleteSelfHostedRunnerFromOrg.mockImplementation(() => {
647+
throw error502;
648+
});
649+
mockOctokit.actions.deleteSelfHostedRunnerFromRepo.mockImplementation(() => {
650+
throw error502;
651+
});
652+
653+
mockGitHubRunners(runners);
654+
mockAwsRunners(runners);
655+
656+
// act
657+
await expect(scaleDown()).resolves.not.toThrow();
658+
659+
// assert - should NOT terminate since de-registration failed
660+
expect(terminateRunner).not.toHaveBeenCalled();
661+
});
662+
634663
const evictionStrategies = ['oldest_first', 'newest_first'];
635664
describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => {
636665
const defaultConfig = {

lambdas/functions/control-plane/src/scale-runners/scale-down.ts

Lines changed: 64 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -103,17 +103,20 @@ async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> {
103103

104104
logger.debug(`[listGithubRunners] Cache miss for ${key}`);
105105
const client = await getOrCreateOctokit(runner);
106-
const runners =
107-
runner.type === 'Org'
108-
? await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
109-
org: runner.owner,
110-
per_page: 100,
111-
})
112-
: await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
113-
owner: runner.owner.split('/')[0],
114-
repo: runner.owner.split('/')[1],
115-
per_page: 100,
116-
});
106+
let runners;
107+
if (runner.type === 'Org') {
108+
runners = await client.paginate(client.actions.listSelfHostedRunnersForOrg, {
109+
org: runner.owner,
110+
per_page: 100,
111+
});
112+
} else {
113+
const [owner, repo] = runner.owner.split('/');
114+
runners = await client.paginate(client.actions.listSelfHostedRunnersForRepo, {
115+
owner,
116+
repo,
117+
per_page: 100,
118+
});
119+
}
117120
githubCache.runners.set(key, runners);
118121
logger.debug(`[listGithubRunners] Cache set for ${key}`);
119122
logger.debug(`[listGithubRunners] Runners: ${JSON.stringify(runners)}`);
@@ -127,8 +130,39 @@ function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean {
127130
return launchTimePlusMinimum < now;
128131
}
129132

133+
async function deleteGitHubRunner(
134+
githubInstallationClient: Octokit,
135+
ec2runner: RunnerInfo,
136+
ghRunnerId: number,
137+
): Promise<{ ghRunnerId: number; status: number; success: boolean }> {
138+
try {
139+
let response;
140+
if (ec2runner.type === 'Org') {
141+
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromOrg({
142+
runner_id: ghRunnerId,
143+
org: ec2runner.owner,
144+
});
145+
} else {
146+
const [owner, repo] = ec2runner.owner.split('/');
147+
response = await githubInstallationClient.actions.deleteSelfHostedRunnerFromRepo({
148+
runner_id: ghRunnerId,
149+
owner,
150+
repo,
151+
});
152+
}
153+
return { ghRunnerId, status: response.status, success: response.status === 204 };
154+
} catch (error) {
155+
logger.error(
156+
`Failed to de-register GitHub runner ${ghRunnerId} for instance '${ec2runner.instanceId}'. ` +
157+
`Error: ${error instanceof Error ? error.message : String(error)}`,
158+
{ error },
159+
);
160+
return { ghRunnerId, status: 0, success: false };
161+
}
162+
}
163+
130164
async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promise<void> {
131-
const githubAppClient = await getOrCreateOctokit(ec2runner);
165+
const githubInstallationClient = await getOrCreateOctokit(ec2runner);
132166
try {
133167
const runnerList = ec2runner as unknown as RunnerList;
134168
if (runnerList.bypassRemoval) {
@@ -141,41 +175,38 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
141175
const states = await Promise.all(
142176
ghRunnerIds.map(async (ghRunnerId) => {
143177
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
144-
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
178+
return await getGitHubRunnerBusyState(githubInstallationClient, ec2runner, ghRunnerId);
145179
}),
146180
);
147181

148182
if (states.every((busy) => busy === false)) {
149-
const statuses = await Promise.all(
150-
ghRunnerIds.map(async (ghRunnerId) => {
151-
return (
152-
ec2runner.type === 'Org'
153-
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
154-
runner_id: ghRunnerId,
155-
org: ec2runner.owner,
156-
})
157-
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
158-
runner_id: ghRunnerId,
159-
owner: ec2runner.owner.split('/')[0],
160-
repo: ec2runner.owner.split('/')[1],
161-
})
162-
).status;
163-
}),
183+
const results = await Promise.all(
184+
ghRunnerIds.map((ghRunnerId) => deleteGitHubRunner(githubInstallationClient, ec2runner, ghRunnerId)),
164185
);
165186

166-
if (statuses.every((status) => status == 204)) {
187+
const allSucceeded = results.every((r) => r.success);
188+
const failedRunners = results.filter((r) => !r.success);
189+
190+
if (allSucceeded) {
167191
await terminateRunner(ec2runner.instanceId);
168192
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
169193
} else {
170-
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
194+
// Only terminate EC2 if we successfully de-registered from GitHub
195+
// Otherwise, leave the instance running so the next scale-down cycle can retry
196+
logger.error(
197+
`Failed to de-register ${failedRunners.length} GitHub runner(s) for instance '${ec2runner.instanceId}'. ` +
198+
`Instance will NOT be terminated to allow retry on next scale-down cycle. ` +
199+
`Failed runner IDs: ${failedRunners.map((r) => r.ghRunnerId).join(', ')}`,
200+
);
171201
}
172202
} else {
173203
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
174204
}
175205
} catch (e) {
176-
logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {
177-
error: e as Error,
178-
});
206+
logger.error(
207+
`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e instanceof Error ? e.message : String(e)}`,
208+
{ error: e },
209+
);
179210
}
180211
}
181212

0 commit comments

Comments
 (0)